github.com/zxy12/golang151_with_comment@v0.0.0-20190507085033-721809559d3c/runtime/asm_amd64.s

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
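// rt0_go is the bootstrap entry reached from the OS-specific _rt0_amd64
// stubs, with argc in DI and argv in SI. After CPU detection, the optional
// _cgo_init call, and TLS setup, it drives roughly this sequence (a sketch
// naming the runtime functions called below, not real Go code):
//
//	runtime·args(argc, argv)   // record the command line
//	runtime·osinit()           // ncpu, page size, ...
//	runtime·schedinit()        // scheduler, allocator, GC setup
//	newproc(runtime·main)      // queue the main goroutine
//	mstart()                   // start this M; never returns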
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
    11  	// copy arguments forward on an even stack
    12  	MOVQ	DI, AX		// argc
    13  	MOVQ	SI, BX		// argv
    14  	SUBQ	$(4*8+7), SP		// 2args 2auto
    15  	ANDQ	$~15, SP
    16  	MOVQ	AX, 16(SP)
    17  	MOVQ	BX, 24(SP)
    18  	
    19  	// create istack out of the given (operating system) stack.
    20  	// _cgo_init may update stackguard.
    21  	MOVQ	$runtime·g0(SB), DI
    22  	LEAQ	(-64*1024+104)(SP), BX
    23  	MOVQ	BX, g_stackguard0(DI)
    24  	MOVQ	BX, g_stackguard1(DI)
    25  	MOVQ	BX, (g_stack+stack_lo)(DI)
    26  	MOVQ	SP, (g_stack+stack_hi)(DI)
    27  
    28  	// find out information about the processor we're on
    29  	MOVQ	$0, AX
    30  	CPUID
    31  	CMPQ	AX, $0
    32  	JE	nocpuinfo
    33  
    34  	// Figure out how to serialize RDTSC.
    35  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    36  	// Don't know about the rest, so let's do MFENCE.
    37  	CMPL	BX, $0x756E6547  // "Genu"
    38  	JNE	notintel
    39  	CMPL	DX, $0x49656E69  // "ineI"
    40  	JNE	notintel
    41  	CMPL	CX, $0x6C65746E  // "ntel"
    42  	JNE	notintel
    43  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
    44  notintel:
    45  
    46  	MOVQ	$1, AX
    47  	CPUID
    48  	MOVL	CX, runtime·cpuid_ecx(SB)
    49  	MOVL	DX, runtime·cpuid_edx(SB)
    50  nocpuinfo:	
    51  	
    52  	// if there is an _cgo_init, call it.
    53  	MOVQ	_cgo_init(SB), AX
    54  	TESTQ	AX, AX
    55  	JZ	needtls
    56  	// g0 already in DI
    57  	MOVQ	DI, CX	// Win64 uses CX for first parameter
    58  	MOVQ	$setg_gcc<>(SB), SI
    59  	CALL	AX
    60  
    61  	// update stackguard after _cgo_init
    62  	MOVQ	$runtime·g0(SB), CX
    63  	MOVQ	(g_stack+stack_lo)(CX), AX
    64  	ADDQ	$const__StackGuard, AX
    65  	MOVQ	AX, g_stackguard0(CX)
    66  	MOVQ	AX, g_stackguard1(CX)
    67  
    68  	CMPL	runtime·iswindows(SB), $0
    69  	JEQ ok
    70  needtls:
    71  	// skip TLS setup on Plan 9
    72  	CMPL	runtime·isplan9(SB), $1
    73  	JEQ ok
    74  	// skip TLS setup on Solaris
    75  	CMPL	runtime·issolaris(SB), $1
    76  	JEQ ok
    77  
    78  	LEAQ	runtime·tls0(SB), DI
    79  	CALL	runtime·settls(SB)
    80  
    81  	// store through it, to make sure it works
    82  	get_tls(BX)
    83  	MOVQ	$0x123, g(BX)
    84  	MOVQ	runtime·tls0(SB), AX
    85  	CMPQ	AX, $0x123
    86  	JEQ 2(PC)
    87  	MOVL	AX, 0	// abort
    88  ok:
    89  	// set the per-goroutine and per-mach "registers"
    90  	get_tls(BX)
    91  	LEAQ	runtime·g0(SB), CX
    92  	MOVQ	CX, g(BX)
    93  	LEAQ	runtime·m0(SB), AX
    94  
    95  	// save m->g0 = g0
    96  	MOVQ	CX, m_g0(AX)
    97  	// save m0 to g0->m
    98  	MOVQ	AX, g_m(CX)
    99  
   100  	CLD				// convention is D is always left cleared
   101  	CALL	runtime·check(SB)
   102  
   103  	MOVL	16(SP), AX		// copy argc
   104  	MOVL	AX, 0(SP)
   105  	MOVQ	24(SP), AX		// copy argv
   106  	MOVQ	AX, 8(SP)
   107  	CALL	runtime·args(SB)
   108  	CALL	runtime·osinit(SB)
   109  	CALL	runtime·schedinit(SB)
   110  
   111  	// create a new goroutine to start program
   112  	MOVQ	$runtime·mainPC(SB), AX		// entry
   113  	PUSHQ	AX
   114  	PUSHQ	$0			// arg size
   115  	CALL	runtime·newproc(SB)
   116  	POPQ	AX
   117  	POPQ	AX
   118  
   119  	// start this M
   120  	CALL	runtime·mstart(SB)
   121  
   122  	MOVL	$0xf1, 0xf1  // crash
   123  	RET
   124  
   125  DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
   126  GLOBL	runtime·mainPC(SB),RODATA,$8
   127  
   128  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
   129  	BYTE	$0xcc
   130  	RET
   131  
   132  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   133  	// No per-thread init.
   134  	RET
   135  
   136  /*
   137   *  go-routine
   138   */
   139  
   140  // void gosave(Gobuf*)
   141  // save state in Gobuf; setjmp
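// In Go terms the Gobuf fields written here are roughly (a sketch):
//
//	buf.sp = caller's SP
//	buf.pc = caller's PC
//	buf.g  = current g
//	buf.bp = BP
//	buf.ret, buf.ctxt = 0
//
// gogo below is the matching longjmp: it reloads these fields and jumps
// to buf.pc on buf.sp.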
   142  TEXT runtime·gosave(SB), NOSPLIT, $0-8
   143  	MOVQ	buf+0(FP), AX		// gobuf
   144  	LEAQ	buf+0(FP), BX		// caller's SP
   145  	MOVQ	BX, gobuf_sp(AX)
   146  	MOVQ	0(SP), BX		// caller's PC
   147  	MOVQ	BX, gobuf_pc(AX)
   148  	MOVQ	$0, gobuf_ret(AX)
   149  	MOVQ	$0, gobuf_ctxt(AX)
   150  	MOVQ	BP, gobuf_bp(AX)
   151  	get_tls(CX)
   152  	MOVQ	g(CX), BX
   153  	MOVQ	BX, gobuf_g(AX)
   154  	RET
   155  
   156  // void gogo(Gobuf*)
   157  // restore state from Gobuf; longjmp
   158  TEXT runtime·gogo(SB), NOSPLIT, $0-8
   159  	MOVQ	buf+0(FP), BX		// gobuf
   160  	MOVQ	gobuf_g(BX), DX
   161  	MOVQ	0(DX), CX		// make sure g != nil
   162  	get_tls(CX)
   163  	MOVQ	DX, g(CX)
   164  	MOVQ	gobuf_sp(BX), SP	// restore SP
   165  	MOVQ	gobuf_ret(BX), AX
   166  	MOVQ	gobuf_ctxt(BX), DX
   167  	MOVQ	gobuf_bp(BX), BP
   168  	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   169  	MOVQ	$0, gobuf_ret(BX)
   170  	MOVQ	$0, gobuf_ctxt(BX)
   171  	MOVQ	$0, gobuf_bp(BX)
   172  	MOVQ	gobuf_pc(BX), BX
   173  	JMP	BX
   174  
   175  // func mcall(fn func(*g))
   176  // Switch to m->g0's stack, call fn(g).
   177  // Fn must never return.  It should gogo(&g->sched)
   178  // to keep running g.
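// In outline (a sketch, not the actual implementation):
//
//	save g's PC and SP into g.sched   // so g can be resumed later
//	switch to m.g0 and its stack
//	fn(g)                             // must not return
//
// Calling mcall while already on g0 aborts via badmcall.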
   179  TEXT runtime·mcall(SB), NOSPLIT, $0-8
   180  	MOVQ	fn+0(FP), DI
   181  	
   182  	get_tls(CX)
   183  	MOVQ	g(CX), AX	// save state in g->sched
   184  	MOVQ	0(SP), BX	// caller's PC
   185  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   186  	LEAQ	fn+0(FP), BX	// caller's SP
   187  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   188  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   189  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   190  
   191  	// switch to m->g0 & its stack, call fn
   192  	MOVQ	g(CX), BX
   193  	MOVQ	g_m(BX), BX
   194  	MOVQ	m_g0(BX), SI
   195  	CMPQ	SI, AX	// if g == m->g0 call badmcall
   196  	JNE	3(PC)
   197  	MOVQ	$runtime·badmcall(SB), AX
   198  	JMP	AX
   199  	MOVQ	SI, g(CX)	// g = m->g0
   200  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   201  	PUSHQ	AX
   202  	MOVQ	DI, DX
   203  	MOVQ	0(DI), DI
   204  	CALL	DI
   205  	POPQ	AX
   206  	MOVQ	$runtime·badmcall2(SB), AX
   207  	JMP	AX
   208  	RET
   209  
   210  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   211  // of the G stack.  We need to distinguish the routine that
   212  // lives at the bottom of the G stack from the one that lives
   213  // at the top of the system stack because the one at the top of
   214  // the system stack terminates the stack walk (see topofstack()).
   215  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   216  	RET
   217  
   218  // func systemstack(fn func())
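// Approximate behavior (a sketch):
//
//	if already on g0 or gsignal {
//		fn()
//	} else if on curg {
//		save curg's state in curg.sched (PC = systemstack_switch)
//		switch to g0's stack
//		fn()
//		switch back to curg and its saved SP
//	} else {
//		badsystemstack()
//	}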
   219  TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   220  	MOVQ	fn+0(FP), DI	// DI = fn
   221  	get_tls(CX)
   222  	MOVQ	g(CX), AX	// AX = g
   223  	MOVQ	g_m(AX), BX	// BX = m
   224  
   225  	MOVQ	m_gsignal(BX), DX	// DX = gsignal
   226  	CMPQ	AX, DX
   227  	JEQ	noswitch
   228  
   229  	MOVQ	m_g0(BX), DX	// DX = g0
   230  	CMPQ	AX, DX
   231  	JEQ	noswitch
   232  
   233  	MOVQ	m_curg(BX), R8
   234  	CMPQ	AX, R8
   235  	JEQ	switch
   236  	
   237  	// Bad: g is not gsignal, not g0, not curg. What is it?
   238  	MOVQ	$runtime·badsystemstack(SB), AX
   239  	CALL	AX
   240  
   241  switch:
   242  	// save our state in g->sched.  Pretend to
   243  	// be systemstack_switch if the G stack is scanned.
   244  	MOVQ	$runtime·systemstack_switch(SB), SI
   245  	MOVQ	SI, (g_sched+gobuf_pc)(AX)
   246  	MOVQ	SP, (g_sched+gobuf_sp)(AX)
   247  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   248  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   249  
   250  	// switch to g0
   251  	MOVQ	DX, g(CX)
   252  	MOVQ	(g_sched+gobuf_sp)(DX), BX
   253  	// make it look like mstart called systemstack on g0, to stop traceback
   254  	SUBQ	$8, BX
   255  	MOVQ	$runtime·mstart(SB), DX
   256  	MOVQ	DX, 0(BX)
   257  	MOVQ	BX, SP
   258  
   259  	// call target function
   260  	MOVQ	DI, DX
   261  	MOVQ	0(DI), DI
   262  	CALL	DI
   263  
   264  	// switch back to g
   265  	get_tls(CX)
   266  	MOVQ	g(CX), AX
   267  	MOVQ	g_m(AX), BX
   268  	MOVQ	m_curg(BX), AX
   269  	MOVQ	AX, g(CX)
   270  	MOVQ	(g_sched+gobuf_sp)(AX), SP
   271  	MOVQ	$0, (g_sched+gobuf_sp)(AX)
   272  	RET
   273  
   274  noswitch:
   275  	// already on m stack, just call directly
   276  	MOVQ	DI, DX
   277  	MOVQ	0(DI), DI
   278  	CALL	DI
   279  	RET
   280  
   281  /*
   282   * support for morestack
   283   */
   284  
   285  // Called during function prolog when more stack is needed.
   286  //
   287  // The traceback routines see morestack on a g0 as being
   288  // the top of a stack (for example, morestack calling newstack
   289  // calling the scheduler calling newm calling gc), so we must
   290  // record an argument size. For that purpose, it has no arguments.
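//
// In outline (a sketch): morestack may not be used to grow the g0 or
// gsignal stacks (it aborts with INT $3 in those cases). Otherwise it
// records f's caller in m.morebuf, saves f's own PC/SP/ctxt in g.sched,
// switches to m.g0's stack, and calls newstack, which allocates a larger
// stack, copies the old one over, and restarts f via gogo.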
   291  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   292  	// Cannot grow scheduler stack (m->g0).
   293  	get_tls(CX)
   294  	MOVQ	g(CX), BX
   295  	MOVQ	g_m(BX), BX
   296  	MOVQ	m_g0(BX), SI
   297  	CMPQ	g(CX), SI
   298  	JNE	2(PC)
   299  	INT	$3
   300  
   301  	// Cannot grow signal stack (m->gsignal).
   302  	MOVQ	m_gsignal(BX), SI
   303  	CMPQ	g(CX), SI
   304  	JNE	2(PC)
   305  	INT	$3
   306  
   307  	// Called from f.
   308  	// Set m->morebuf to f's caller.
   309  	MOVQ	8(SP), AX	// f's caller's PC
   310  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   311  	LEAQ	16(SP), AX	// f's caller's SP
   312  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   313  	get_tls(CX)
   314  	MOVQ	g(CX), SI
   315  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   316  
   317  	// Set g->sched to context in f.
   318  	MOVQ	0(SP), AX // f's PC
   319  	MOVQ	AX, (g_sched+gobuf_pc)(SI)
   320  	MOVQ	SI, (g_sched+gobuf_g)(SI)
   321  	LEAQ	8(SP), AX // f's SP
   322  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   323  	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
   324  	MOVQ	BP, (g_sched+gobuf_bp)(SI)
   325  
   326  	// Call newstack on m->g0's stack.
   327  	MOVQ	m_g0(BX), BX
   328  	MOVQ	BX, g(CX)
   329  	MOVQ	(g_sched+gobuf_sp)(BX), SP
   330  	CALL	runtime·newstack(SB)
   331  	MOVQ	$0, 0x1003	// crash if newstack returns
   332  	RET
   333  
   334  // morestack but not preserving ctxt.
   335  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   336  	MOVL	$0, DX
   337  	JMP	runtime·morestack(SB)
   338  
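// stackBarrier is installed by the GC over a frame's return PC so that
// returning through it tells the runtime how far the stack has unwound.
// Roughly, in Go terms (a sketch):
//
//	pos := g.stkbarPos
//	ret := g.stkbar[pos].savedLRVal // the overwritten return PC
//	g.stkbarPos = pos + 1
//	goto ret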
   339  TEXT runtime·stackBarrier(SB),NOSPLIT,$0
   340  	// We came here via a RET to an overwritten return PC.
   341  	// AX may be live. Other registers are available.
   342  
   343  	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   344  	get_tls(CX)
   345  	MOVQ	g(CX), CX
   346  	MOVQ	(g_stkbar+slice_array)(CX), DX
   347  	MOVQ	g_stkbarPos(CX), BX
   348  	IMULQ	$stkbar__size, BX	// Too big for SIB.
   349  	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
   350  	// Record that this stack barrier was hit.
   351  	ADDQ	$1, g_stkbarPos(CX)
   352  	// Jump to the original return PC.
   353  	JMP	BX
   354  
   355  // reflectcall: call a function with the given argument list
   356  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   357  // we don't have variable-sized frames, so we use a small number
   358  // of constant-sized-frame functions to encode a few bits of size in the pc.
   359  // Caution: ugly multiline assembly macros in your future!
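// Each DISPATCH below compares the argument size against a power-of-two
// bucket and tail-jumps to the matching fixed-frame callNN routine, so the
// effect is roughly (a sketch):
//
//	switch {
//	case argsize <= 32:  call32(...)
//	case argsize <= 64:  call64(...)
//	// ... doubling up to 1 GB ...
//	default:             badreflectcall(...)
//	}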
   360  
   361  #define DISPATCH(NAME,MAXSIZE)		\
   362  	CMPQ	CX, $MAXSIZE;		\
   363  	JA	3(PC);			\
   364  	MOVQ	$NAME(SB), AX;		\
   365  	JMP	AX
   366  // Note: can't just "JMP NAME(SB)" - bad inlining results.
   367  
   368  TEXT reflect·call(SB), NOSPLIT, $0-0
   369  	JMP	·reflectcall(SB)
   370  
   371  TEXT ·reflectcall(SB), NOSPLIT, $0-32
   372  	MOVLQZX argsize+24(FP), CX
   373  	// NOTE(rsc): No call16, because CALLFN needs four words
   374  	// of argument space to invoke callwritebarrier.
   375  	DISPATCH(runtime·call32, 32)
   376  	DISPATCH(runtime·call64, 64)
   377  	DISPATCH(runtime·call128, 128)
   378  	DISPATCH(runtime·call256, 256)
   379  	DISPATCH(runtime·call512, 512)
   380  	DISPATCH(runtime·call1024, 1024)
   381  	DISPATCH(runtime·call2048, 2048)
   382  	DISPATCH(runtime·call4096, 4096)
   383  	DISPATCH(runtime·call8192, 8192)
   384  	DISPATCH(runtime·call16384, 16384)
   385  	DISPATCH(runtime·call32768, 32768)
   386  	DISPATCH(runtime·call65536, 65536)
   387  	DISPATCH(runtime·call131072, 131072)
   388  	DISPATCH(runtime·call262144, 262144)
   389  	DISPATCH(runtime·call524288, 524288)
   390  	DISPATCH(runtime·call1048576, 1048576)
   391  	DISPATCH(runtime·call2097152, 2097152)
   392  	DISPATCH(runtime·call4194304, 4194304)
   393  	DISPATCH(runtime·call8388608, 8388608)
   394  	DISPATCH(runtime·call16777216, 16777216)
   395  	DISPATCH(runtime·call33554432, 33554432)
   396  	DISPATCH(runtime·call67108864, 67108864)
   397  	DISPATCH(runtime·call134217728, 134217728)
   398  	DISPATCH(runtime·call268435456, 268435456)
   399  	DISPATCH(runtime·call536870912, 536870912)
   400  	DISPATCH(runtime·call1073741824, 1073741824)
   401  	MOVQ	$runtime·badreflectcall(SB), AX
   402  	JMP	AX
   403  
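// Each callNN generated by CALLFN below does, in outline (a sketch):
//
//	copy argsize bytes of arguments from argptr onto the local frame
//	call f (DX holds f, the closure context register)
//	copy the results (bytes [retoffset, argsize)) back to argptr
//	callwritebarrier(argtype, argptr, argsize, retoffset)
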
   404  #define CALLFN(NAME,MAXSIZE)			\
   405  TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   406  	NO_LOCAL_POINTERS;			\
   407  	/* copy arguments to stack */		\
   408  	MOVQ	argptr+16(FP), SI;		\
   409  	MOVLQZX argsize+24(FP), CX;		\
   410  	MOVQ	SP, DI;				\
   411  	REP;MOVSB;				\
   412  	/* call function */			\
   413  	MOVQ	f+8(FP), DX;			\
   414  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   415  	CALL	(DX);				\
   416  	/* copy return values back */		\
   417  	MOVQ	argptr+16(FP), DI;		\
   418  	MOVLQZX	argsize+24(FP), CX;		\
   419  	MOVLQZX retoffset+28(FP), BX;		\
   420  	MOVQ	SP, SI;				\
   421  	ADDQ	BX, DI;				\
   422  	ADDQ	BX, SI;				\
   423  	SUBQ	BX, CX;				\
   424  	REP;MOVSB;				\
   425  	/* execute write barrier updates */	\
   426  	MOVQ	argtype+0(FP), DX;		\
   427  	MOVQ	argptr+16(FP), DI;		\
   428  	MOVLQZX	argsize+24(FP), CX;		\
   429  	MOVLQZX retoffset+28(FP), BX;		\
   430  	MOVQ	DX, 0(SP);			\
   431  	MOVQ	DI, 8(SP);			\
   432  	MOVQ	CX, 16(SP);			\
   433  	MOVQ	BX, 24(SP);			\
   434  	CALL	runtime·callwritebarrier(SB);	\
   435  	RET
   436  
   437  CALLFN(·call32, 32)
   438  CALLFN(·call64, 64)
   439  CALLFN(·call128, 128)
   440  CALLFN(·call256, 256)
   441  CALLFN(·call512, 512)
   442  CALLFN(·call1024, 1024)
   443  CALLFN(·call2048, 2048)
   444  CALLFN(·call4096, 4096)
   445  CALLFN(·call8192, 8192)
   446  CALLFN(·call16384, 16384)
   447  CALLFN(·call32768, 32768)
   448  CALLFN(·call65536, 65536)
   449  CALLFN(·call131072, 131072)
   450  CALLFN(·call262144, 262144)
   451  CALLFN(·call524288, 524288)
   452  CALLFN(·call1048576, 1048576)
   453  CALLFN(·call2097152, 2097152)
   454  CALLFN(·call4194304, 4194304)
   455  CALLFN(·call8388608, 8388608)
   456  CALLFN(·call16777216, 16777216)
   457  CALLFN(·call33554432, 33554432)
   458  CALLFN(·call67108864, 67108864)
   459  CALLFN(·call134217728, 134217728)
   460  CALLFN(·call268435456, 268435456)
   461  CALLFN(·call536870912, 536870912)
   462  CALLFN(·call1073741824, 1073741824)
   463  
   464  // bool cas(int32 *val, int32 old, int32 new)
   465  // Atomically:
   466  //	if(*val == old){
   467  //		*val = new;
   468  //		return 1;
   469  //	} else
   470  //		return 0;
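//
// For ordinary Go code the analogue is sync/atomic (a sketch; the runtime
// keeps its own copy here rather than importing that package):
//
//	swapped := atomic.CompareAndSwapInt32(val, old, new)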
   471  TEXT runtime·cas(SB), NOSPLIT, $0-17
   472  	MOVQ	ptr+0(FP), BX
   473  	MOVL	old+8(FP), AX
   474  	MOVL	new+12(FP), CX
   475  	LOCK
   476  	CMPXCHGL	CX, 0(BX)
   477  	SETEQ	ret+16(FP)
   478  	RET
   479  
   480  // bool	runtime·cas64(uint64 *val, uint64 old, uint64 new)
   481  // Atomically:
   482  //	if(*val == *old){
   483  //		*val = new;
   484  //		return 1;
   485  //	} else {
   486  //		return 0;
   487  //	}
   488  TEXT runtime·cas64(SB), NOSPLIT, $0-25
   489  	MOVQ	ptr+0(FP), BX
   490  	MOVQ	old+8(FP), AX
   491  	MOVQ	new+16(FP), CX
   492  	LOCK
   493  	CMPXCHGQ	CX, 0(BX)
   494  	SETEQ	ret+24(FP)
   495  	RET
   496  	
   497  TEXT runtime·casuintptr(SB), NOSPLIT, $0-25
   498  	JMP	runtime·cas64(SB)
   499  
   500  TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-16
   501  	JMP	runtime·atomicload64(SB)
   502  
   503  TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-16
   504  	JMP	runtime·atomicload64(SB)
   505  
   506  TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16
   507  	JMP	runtime·atomicstore64(SB)
   508  
   509  // bool casp(void **val, void *old, void *new)
   510  // Atomically:
   511  //	if(*val == old){
   512  //		*val = new;
   513  //		return 1;
   514  //	} else
   515  //		return 0;
   516  TEXT runtime·casp1(SB), NOSPLIT, $0-25
   517  	MOVQ	ptr+0(FP), BX
   518  	MOVQ	old+8(FP), AX
   519  	MOVQ	new+16(FP), CX
   520  	LOCK
   521  	CMPXCHGQ	CX, 0(BX)
   522  	SETEQ	ret+24(FP)
   523  	RET
   524  
   525  // uint32 xadd(uint32 volatile *val, int32 delta)
   526  // Atomically:
   527  //	*val += delta;
   528  //	return *val;
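//
// Roughly equivalent to (a sketch using sync/atomic):
//
//	newval := atomic.AddUint32(val, uint32(delta)) // returns the new value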
   529  TEXT runtime·xadd(SB), NOSPLIT, $0-20
   530  	MOVQ	ptr+0(FP), BX
   531  	MOVL	delta+8(FP), AX
   532  	MOVL	AX, CX
   533  	LOCK
   534  	XADDL	AX, 0(BX)
   535  	ADDL	CX, AX
   536  	MOVL	AX, ret+16(FP)
   537  	RET
   538  
   539  TEXT runtime·xadd64(SB), NOSPLIT, $0-24
   540  	MOVQ	ptr+0(FP), BX
   541  	MOVQ	delta+8(FP), AX
   542  	MOVQ	AX, CX
   543  	LOCK
   544  	XADDQ	AX, 0(BX)
   545  	ADDQ	CX, AX
   546  	MOVQ	AX, ret+16(FP)
   547  	RET
   548  
   549  TEXT runtime·xadduintptr(SB), NOSPLIT, $0-24
   550  	JMP	runtime·xadd64(SB)
   551  
   552  TEXT runtime·xchg(SB), NOSPLIT, $0-20
   553  	MOVQ	ptr+0(FP), BX
   554  	MOVL	new+8(FP), AX
   555  	XCHGL	AX, 0(BX)
   556  	MOVL	AX, ret+16(FP)
   557  	RET
   558  
   559  TEXT runtime·xchg64(SB), NOSPLIT, $0-24
   560  	MOVQ	ptr+0(FP), BX
   561  	MOVQ	new+8(FP), AX
   562  	XCHGQ	AX, 0(BX)
   563  	MOVQ	AX, ret+16(FP)
   564  	RET
   565  
   566  TEXT runtime·xchgp1(SB), NOSPLIT, $0-24
   567  	MOVQ	ptr+0(FP), BX
   568  	MOVQ	new+8(FP), AX
   569  	XCHGQ	AX, 0(BX)
   570  	MOVQ	AX, ret+16(FP)
   571  	RET
   572  
   573  TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24
   574  	JMP	runtime·xchg64(SB)
   575  
   576  TEXT runtime·procyield(SB),NOSPLIT,$0-0
   577  	MOVL	cycles+0(FP), AX
   578  again:
   579  	PAUSE
   580  	SUBL	$1, AX
   581  	JNZ	again
   582  	RET
   583  
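// The stores below use XCHG rather than MOV: an XCHG with a memory operand
// is implicitly locked, so the store also acts as a full memory barrier.
// In Go terms the effect is roughly that of (a sketch):
//
//	atomic.StoreUint32(ptr, val) // or StoreUint64 / StorePointer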
   584  TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16
   585  	MOVQ	ptr+0(FP), BX
   586  	MOVQ	val+8(FP), AX
   587  	XCHGQ	AX, 0(BX)
   588  	RET
   589  
   590  TEXT runtime·atomicstore(SB), NOSPLIT, $0-12
   591  	MOVQ	ptr+0(FP), BX
   592  	MOVL	val+8(FP), AX
   593  	XCHGL	AX, 0(BX)
   594  	RET
   595  
   596  TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16
   597  	MOVQ	ptr+0(FP), BX
   598  	MOVQ	val+8(FP), AX
   599  	XCHGQ	AX, 0(BX)
   600  	RET
   601  
   602  // void	runtime·atomicor8(byte volatile*, byte);
   603  TEXT runtime·atomicor8(SB), NOSPLIT, $0-9
   604  	MOVQ	ptr+0(FP), AX
   605  	MOVB	val+8(FP), BX
   606  	LOCK
   607  	ORB	BX, (AX)
   608  	RET
   609  
   610  // void	runtime·atomicand8(byte volatile*, byte);
   611  TEXT runtime·atomicand8(SB), NOSPLIT, $0-9
   612  	MOVQ	ptr+0(FP), AX
   613  	MOVB	val+8(FP), BX
   614  	LOCK
   615  	ANDB	BX, (AX)
   616  	RET
   617  
   618  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   619  	// Stores are already ordered on x86, so this is just a
   620  	// compile barrier.
   621  	RET
   622  
   623  // void jmpdefer(fn, sp);
   624  // called from deferreturn.
   625  // 1. pop the caller
    626  // 2. sub 5 bytes from the caller's return address
   627  // 3. jmp to the argument
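//
// The "5 bytes" is the length of a CALL rel32 instruction: backing the
// caller's return address up by 5 makes it point at the CALL to
// deferreturn again, so when the deferred function returns, deferreturn
// runs once more and processes the next deferred call in the chain.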
   628  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   629  	MOVQ	fv+0(FP), DX	// fn
   630  	MOVQ	argp+8(FP), BX	// caller sp
   631  	LEAQ	-8(BX), SP	// caller sp after CALL
   632  	SUBQ	$5, (SP)	// return to CALL again
   633  	MOVQ	0(DX), BX
   634  	JMP	BX	// but first run the deferred function
   635  
   636  // Save state of caller into g->sched. Smashes R8, R9.
   637  TEXT gosave<>(SB),NOSPLIT,$0
   638  	get_tls(R8)
   639  	MOVQ	g(R8), R8
   640  	MOVQ	0(SP), R9
   641  	MOVQ	R9, (g_sched+gobuf_pc)(R8)
   642  	LEAQ	8(SP), R9
   643  	MOVQ	R9, (g_sched+gobuf_sp)(R8)
   644  	MOVQ	$0, (g_sched+gobuf_ret)(R8)
   645  	MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
   646  	MOVQ	BP, (g_sched+gobuf_bp)(R8)
   647  	RET
   648  
   649  // func asmcgocall(fn, arg unsafe.Pointer) int32
   650  // Call fn(arg) on the scheduler stack,
   651  // aligned appropriately for the gcc ABI.
   652  // See cgocall.go for more details.
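//
// In outline (a sketch): unless already on g0 or gsignal, save the current
// g's state with gosave<> and switch to m.g0's stack; then align the stack
// for the C ABI, call fn(arg) with the argument in DI (CX on Windows), and
// finally switch back to the original g and stack, returning fn's int32
// result.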
   653  TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   654  	MOVQ	fn+0(FP), AX
   655  	MOVQ	arg+8(FP), BX
   656  
   657  	MOVQ	SP, DX
   658  
   659  	// Figure out if we need to switch to m->g0 stack.
   660  	// We get called to create new OS threads too, and those
   661  	// come in on the m->g0 stack already.
   662  	get_tls(CX)
   663  	MOVQ	g(CX), R8
   664  	MOVQ	g_m(R8), R8
   665  	MOVQ	m_g0(R8), SI
   666  	MOVQ	g(CX), DI
   667  	CMPQ	SI, DI
   668  	JEQ	nosave
   669  	MOVQ	m_gsignal(R8), SI
   670  	CMPQ	SI, DI
   671  	JEQ	nosave
   672  	
   673  	MOVQ	m_g0(R8), SI
   674  	CALL	gosave<>(SB)
   675  	MOVQ	SI, g(CX)
   676  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   677  nosave:
   678  
   679  	// Now on a scheduling stack (a pthread-created stack).
   680  	// Make sure we have enough room for 4 stack-backed fast-call
   681  	// registers as per windows amd64 calling convention.
   682  	SUBQ	$64, SP
   683  	ANDQ	$~15, SP	// alignment for gcc ABI
   684  	MOVQ	DI, 48(SP)	// save g
   685  	MOVQ	(g_stack+stack_hi)(DI), DI
   686  	SUBQ	DX, DI
   687  	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   688  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   689  	MOVQ	BX, CX		// CX = first argument in Win64
   690  	CALL	AX
   691  
   692  	// Restore registers, g, stack pointer.
   693  	get_tls(CX)
   694  	MOVQ	48(SP), DI
   695  	MOVQ	(g_stack+stack_hi)(DI), SI
   696  	SUBQ	40(SP), SI
   697  	MOVQ	DI, g(CX)
   698  	MOVQ	SI, SP
   699  
   700  	MOVL	AX, ret+16(FP)
   701  	RET
   702  
   703  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
   704  // Turn the fn into a Go func (by taking its address) and call
   705  // cgocallback_gofunc.
   706  TEXT runtime·cgocallback(SB),NOSPLIT,$24-24
   707  	LEAQ	fn+0(FP), AX
   708  	MOVQ	AX, 0(SP)
   709  	MOVQ	frame+8(FP), AX
   710  	MOVQ	AX, 8(SP)
   711  	MOVQ	framesize+16(FP), AX
   712  	MOVQ	AX, 16(SP)
   713  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   714  	CALL	AX
   715  	RET
   716  
   717  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
   718  // See cgocall.go for more details.
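//
// In outline (a sketch): if the current thread has no m (it was created by
// C, not Go), borrow one with needm and return it with dropm at the end.
// Save m.g0's old sched.sp, switch from g0 to m.curg's stack, call
// runtime.cgocallbackg there, then switch back to g0 and restore its saved
// sched.sp.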
   719  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-24
   720  	NO_LOCAL_POINTERS
   721  
   722  	// If g is nil, Go did not create the current thread.
   723  	// Call needm to obtain one m for temporary use.
   724  	// In this case, we're running on the thread stack, so there's
   725  	// lots of space, but the linker doesn't know. Hide the call from
   726  	// the linker analysis by using an indirect call through AX.
   727  	get_tls(CX)
   728  #ifdef GOOS_windows
   729  	MOVL	$0, BX
   730  	CMPQ	CX, $0
   731  	JEQ	2(PC)
   732  #endif
   733  	MOVQ	g(CX), BX
   734  	CMPQ	BX, $0
   735  	JEQ	needm
   736  	MOVQ	g_m(BX), BX
   737  	MOVQ	BX, R8 // holds oldm until end of function
   738  	JMP	havem
   739  needm:
   740  	MOVQ	$0, 0(SP)
   741  	MOVQ	$runtime·needm(SB), AX
   742  	CALL	AX
   743  	MOVQ	0(SP), R8
   744  	get_tls(CX)
   745  	MOVQ	g(CX), BX
   746  	MOVQ	g_m(BX), BX
   747  	
   748  	// Set m->sched.sp = SP, so that if a panic happens
   749  	// during the function we are about to execute, it will
   750  	// have a valid SP to run on the g0 stack.
   751  	// The next few lines (after the havem label)
   752  	// will save this SP onto the stack and then write
   753  	// the same SP back to m->sched.sp. That seems redundant,
   754  	// but if an unrecovered panic happens, unwindm will
   755  	// restore the g->sched.sp from the stack location
   756  	// and then systemstack will try to use it. If we don't set it here,
   757  	// that restored SP will be uninitialized (typically 0) and
   758  	// will not be usable.
   759  	MOVQ	m_g0(BX), SI
   760  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   761  
   762  havem:
   763  	// Now there's a valid m, and we're running on its m->g0.
   764  	// Save current m->g0->sched.sp on stack and then set it to SP.
   765  	// Save current sp in m->g0->sched.sp in preparation for
   766  	// switch back to m->curg stack.
   767  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   768  	MOVQ	m_g0(BX), SI
   769  	MOVQ	(g_sched+gobuf_sp)(SI), AX
   770  	MOVQ	AX, 0(SP)
   771  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   772  
   773  	// Switch to m->curg stack and call runtime.cgocallbackg.
   774  	// Because we are taking over the execution of m->curg
   775  	// but *not* resuming what had been running, we need to
   776  	// save that information (m->curg->sched) so we can restore it.
   777  	// We can restore m->curg->sched.sp easily, because calling
   778  	// runtime.cgocallbackg leaves SP unchanged upon return.
   779  	// To save m->curg->sched.pc, we push it onto the stack.
   780  	// This has the added benefit that it looks to the traceback
   781  	// routine like cgocallbackg is going to return to that
   782  	// PC (because the frame we allocate below has the same
   783  	// size as cgocallback_gofunc's frame declared above)
   784  	// so that the traceback will seamlessly trace back into
   785  	// the earlier calls.
   786  	//
   787  	// In the new goroutine, 0(SP) holds the saved R8.
   788  	MOVQ	m_curg(BX), SI
   789  	MOVQ	SI, g(CX)
   790  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   791  	MOVQ	(g_sched+gobuf_pc)(SI), BX
   792  	MOVQ	BX, -8(DI)
   793  	// Compute the size of the frame, including return PC and, if
    794  	// GOEXPERIMENT=framepointer, the saved base pointer
   795  	LEAQ	fv+0(FP), AX
   796  	SUBQ	SP, AX
   797  	SUBQ	AX, DI
   798  	MOVQ	DI, SP
   799  
   800  	MOVQ	R8, 0(SP)
   801  	CALL	runtime·cgocallbackg(SB)
   802  	MOVQ	0(SP), R8
   803  
   804  	// Compute the size of the frame again.  FP and SP have
   805  	// completely different values here than they did above,
   806  	// but only their difference matters.
   807  	LEAQ	fv+0(FP), AX
   808  	SUBQ	SP, AX
   809  
   810  	// Restore g->sched (== m->curg->sched) from saved values.
   811  	get_tls(CX)
   812  	MOVQ	g(CX), SI
   813  	MOVQ	SP, DI
   814  	ADDQ	AX, DI
   815  	MOVQ	-8(DI), BX
   816  	MOVQ	BX, (g_sched+gobuf_pc)(SI)
   817  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   818  
   819  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   820  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   821  	// so we do not have to restore it.)
   822  	MOVQ	g(CX), BX
   823  	MOVQ	g_m(BX), BX
   824  	MOVQ	m_g0(BX), SI
   825  	MOVQ	SI, g(CX)
   826  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   827  	MOVQ	0(SP), AX
   828  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   829  	
   830  	// If the m on entry was nil, we called needm above to borrow an m
   831  	// for the duration of the call. Since the call is over, return it with dropm.
   832  	CMPQ	R8, $0
   833  	JNE 3(PC)
   834  	MOVQ	$runtime·dropm(SB), AX
   835  	CALL	AX
   836  
   837  	// Done!
   838  	RET
   839  
    840  // void setg(G*); set g. For use by needm.
   841  TEXT runtime·setg(SB), NOSPLIT, $0-8
   842  	MOVQ	gg+0(FP), BX
   843  #ifdef GOOS_windows
   844  	CMPQ	BX, $0
   845  	JNE	settls
   846  	MOVQ	$0, 0x28(GS)
   847  	RET
   848  settls:
   849  	MOVQ	g_m(BX), AX
   850  	LEAQ	m_tls(AX), AX
   851  	MOVQ	AX, 0x28(GS)
   852  #endif
   853  	get_tls(CX)
   854  	MOVQ	BX, g(CX)
   855  	RET
   856  
    857  // void setg_gcc(G*); set g, called from gcc.
   858  TEXT setg_gcc<>(SB),NOSPLIT,$0
   859  	get_tls(AX)
   860  	MOVQ	DI, g(AX)
   861  	RET
   862  
   863  // check that SP is in range [g->stack.lo, g->stack.hi)
   864  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   865  	get_tls(CX)
   866  	MOVQ	g(CX), AX
   867  	CMPQ	(g_stack+stack_hi)(AX), SP
   868  	JHI	2(PC)
   869  	INT	$3
   870  	CMPQ	SP, (g_stack+stack_lo)(AX)
   871  	JHI	2(PC)
   872  	INT	$3
   873  	RET
   874  
   875  TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
   876  	MOVQ	argp+0(FP),AX		// addr of first arg
   877  	MOVQ	-8(AX),AX		// get calling pc
   878  	CMPQ	AX, runtime·stackBarrierPC(SB)
   879  	JNE	nobar
   880  	// Get original return PC.
   881  	CALL	runtime·nextBarrierPC(SB)
   882  	MOVQ	0(SP), AX
   883  nobar:
   884  	MOVQ	AX, ret+8(FP)
   885  	RET
   886  
   887  TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
   888  	MOVQ	argp+0(FP),AX		// addr of first arg
   889  	MOVQ	pc+8(FP), BX
   890  	MOVQ	-8(AX), CX
   891  	CMPQ	CX, runtime·stackBarrierPC(SB)
   892  	JEQ	setbar
   893  	MOVQ	BX, -8(AX)		// set calling pc
   894  	RET
   895  setbar:
   896  	// Set the stack barrier return PC.
   897  	MOVQ	BX, 0(SP)
   898  	CALL	runtime·setNextBarrierPC(SB)
   899  	RET
   900  
   901  TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
   902  	MOVQ	argp+0(FP), AX
   903  	MOVQ	AX, ret+8(FP)
   904  	RET
   905  
   906  // func cputicks() int64
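// RDTSC returns the timestamp counter split across EDX:EAX, so the result
// is assembled as (a sketch):
//
//	ticks := int64(uint64(edx)<<32 | uint64(eax))
//
// The preceding LFENCE (Intel) or MFENCE (other vendors) keeps RDTSC from
// being reordered ahead of earlier instructions.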
   907  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   908  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   909  	JNE	mfence
   910  	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
   911  	JMP	done
   912  mfence:
   913  	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
   914  done:
   915  	RDTSC
   916  	SHLQ	$32, DX
   917  	ADDQ	DX, AX
   918  	MOVQ	AX, ret+0(FP)
   919  	RET
   920  
   921  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   922  // redirects to memhash(p, h, size) using the size
   923  // stored in the closure.
   924  TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
   925  	GO_ARGS
   926  	NO_LOCAL_POINTERS
   927  	MOVQ	p+0(FP), AX
   928  	MOVQ	h+8(FP), BX
   929  	MOVQ	8(DX), CX
   930  	MOVQ	AX, 0(SP)
   931  	MOVQ	BX, 8(SP)
   932  	MOVQ	CX, 16(SP)
   933  	CALL	runtime·memhash(SB)
   934  	MOVQ	24(SP), AX
   935  	MOVQ	AX, ret+16(FP)
   936  	RET
   937  
   938  // hash function using AES hardware instructions
   939  TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   940  	MOVQ	p+0(FP), AX	// ptr to data
   941  	MOVQ	s+16(FP), CX	// size
   942  	LEAQ	ret+24(FP), DX
   943  	JMP	runtime·aeshashbody(SB)
   944  
   945  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
   946  	MOVQ	p+0(FP), AX	// ptr to string struct
   947  	MOVQ	8(AX), CX	// length of string
   948  	MOVQ	(AX), AX	// string data
   949  	LEAQ	ret+16(FP), DX
   950  	JMP	runtime·aeshashbody(SB)
   951  
   952  // AX: data
   953  // CX: length
   954  // DX: address to put return value
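//
// The body dispatches on length: inputs of 0-15 bytes are loaded with a
// mask or pshufb shuffle (see masks<> and shifts<> below) so short inputs
// never fault across a page boundary; 16-128 bytes use a fixed number of
// 16-byte blocks; longer inputs loop over 128-byte chunks. Each non-empty
// path mixes the seed and size (held in X6) and the key schedule
// runtime·aeskeysched into the data through several AESENC rounds; a
// zero-length input simply returns the seed.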
   955  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   956  	MOVQ	h+8(FP), X6	// seed to low 64 bits of xmm6
   957  	PINSRQ	$1, CX, X6	// size to high 64 bits of xmm6
   958  	PSHUFHW	$0, X6, X6	// replace size with its low 2 bytes repeated 4 times
   959  	MOVO	runtime·aeskeysched(SB), X7
   960  	CMPQ	CX, $16
   961  	JB	aes0to15
   962  	JE	aes16
   963  	CMPQ	CX, $32
   964  	JBE	aes17to32
   965  	CMPQ	CX, $64
   966  	JBE	aes33to64
   967  	CMPQ	CX, $128
   968  	JBE	aes65to128
   969  	JMP	aes129plus
   970  
   971  aes0to15:
   972  	TESTQ	CX, CX
   973  	JE	aes0
   974  
   975  	ADDQ	$16, AX
   976  	TESTW	$0xff0, AX
   977  	JE	endofpage
   978  
   979  	// 16 bytes loaded at this address won't cross
   980  	// a page boundary, so we can load it directly.
   981  	MOVOU	-16(AX), X0
   982  	ADDQ	CX, CX
   983  	MOVQ	$masks<>(SB), AX
   984  	PAND	(AX)(CX*8), X0
   985  
   986  	// scramble 3 times
   987  	AESENC	X6, X0
   988  	AESENC	X7, X0
   989  	AESENC	X7, X0
   990  	MOVQ	X0, (DX)
   991  	RET
   992  
   993  endofpage:
   994  	// address ends in 1111xxxx.  Might be up against
   995  	// a page boundary, so load ending at last byte.
   996  	// Then shift bytes down using pshufb.
   997  	MOVOU	-32(AX)(CX*1), X0
   998  	ADDQ	CX, CX
   999  	MOVQ	$shifts<>(SB), AX
  1000  	PSHUFB	(AX)(CX*8), X0
  1001  	AESENC	X6, X0
  1002  	AESENC	X7, X0
  1003  	AESENC	X7, X0
  1004  	MOVQ	X0, (DX)
  1005  	RET
  1006  
  1007  aes0:
  1008  	// return input seed
  1009  	MOVQ	h+8(FP), AX
  1010  	MOVQ	AX, (DX)
  1011  	RET
  1012  
  1013  aes16:
  1014  	MOVOU	(AX), X0
  1015  	AESENC	X6, X0
  1016  	AESENC	X7, X0
  1017  	AESENC	X7, X0
  1018  	MOVQ	X0, (DX)
  1019  	RET
  1020  
  1021  aes17to32:
  1022  	// load data to be hashed
  1023  	MOVOU	(AX), X0
  1024  	MOVOU	-16(AX)(CX*1), X1
  1025  
  1026  	// scramble 3 times
  1027  	AESENC	X6, X0
  1028  	AESENC	runtime·aeskeysched+16(SB), X1
  1029  	AESENC	X7, X0
  1030  	AESENC	X7, X1
  1031  	AESENC	X7, X0
  1032  	AESENC	X7, X1
  1033  
  1034  	// combine results
  1035  	PXOR	X1, X0
  1036  	MOVQ	X0, (DX)
  1037  	RET
  1038  
  1039  aes33to64:
  1040  	MOVOU	(AX), X0
  1041  	MOVOU	16(AX), X1
  1042  	MOVOU	-32(AX)(CX*1), X2
  1043  	MOVOU	-16(AX)(CX*1), X3
  1044  	
  1045  	AESENC	X6, X0
  1046  	AESENC	runtime·aeskeysched+16(SB), X1
  1047  	AESENC	runtime·aeskeysched+32(SB), X2
  1048  	AESENC	runtime·aeskeysched+48(SB), X3
  1049  	AESENC	X7, X0
  1050  	AESENC	X7, X1
  1051  	AESENC	X7, X2
  1052  	AESENC	X7, X3
  1053  	AESENC	X7, X0
  1054  	AESENC	X7, X1
  1055  	AESENC	X7, X2
  1056  	AESENC	X7, X3
  1057  
  1058  	PXOR	X2, X0
  1059  	PXOR	X3, X1
  1060  	PXOR	X1, X0
  1061  	MOVQ	X0, (DX)
  1062  	RET
  1063  
  1064  aes65to128:
  1065  	MOVOU	(AX), X0
  1066  	MOVOU	16(AX), X1
  1067  	MOVOU	32(AX), X2
  1068  	MOVOU	48(AX), X3
  1069  	MOVOU	-64(AX)(CX*1), X4
  1070  	MOVOU	-48(AX)(CX*1), X5
  1071  	MOVOU	-32(AX)(CX*1), X8
  1072  	MOVOU	-16(AX)(CX*1), X9
  1073  	
  1074  	AESENC	X6, X0
  1075  	AESENC	runtime·aeskeysched+16(SB), X1
  1076  	AESENC	runtime·aeskeysched+32(SB), X2
  1077  	AESENC	runtime·aeskeysched+48(SB), X3
  1078  	AESENC	runtime·aeskeysched+64(SB), X4
  1079  	AESENC	runtime·aeskeysched+80(SB), X5
  1080  	AESENC	runtime·aeskeysched+96(SB), X8
  1081  	AESENC	runtime·aeskeysched+112(SB), X9
  1082  	AESENC	X7, X0
  1083  	AESENC	X7, X1
  1084  	AESENC	X7, X2
  1085  	AESENC	X7, X3
  1086  	AESENC	X7, X4
  1087  	AESENC	X7, X5
  1088  	AESENC	X7, X8
  1089  	AESENC	X7, X9
  1090  	AESENC	X7, X0
  1091  	AESENC	X7, X1
  1092  	AESENC	X7, X2
  1093  	AESENC	X7, X3
  1094  	AESENC	X7, X4
  1095  	AESENC	X7, X5
  1096  	AESENC	X7, X8
  1097  	AESENC	X7, X9
  1098  
  1099  	PXOR	X4, X0
  1100  	PXOR	X5, X1
  1101  	PXOR	X8, X2
  1102  	PXOR	X9, X3
  1103  	PXOR	X2, X0
  1104  	PXOR	X3, X1
  1105  	PXOR	X1, X0
  1106  	MOVQ	X0, (DX)
  1107  	RET
  1108  
  1109  aes129plus:
  1110  	// start with last (possibly overlapping) block
  1111  	MOVOU	-128(AX)(CX*1), X0
  1112  	MOVOU	-112(AX)(CX*1), X1
  1113  	MOVOU	-96(AX)(CX*1), X2
  1114  	MOVOU	-80(AX)(CX*1), X3
  1115  	MOVOU	-64(AX)(CX*1), X4
  1116  	MOVOU	-48(AX)(CX*1), X5
  1117  	MOVOU	-32(AX)(CX*1), X8
  1118  	MOVOU	-16(AX)(CX*1), X9
  1119  
  1120  	// scramble state once
  1121  	AESENC	X6, X0
  1122  	AESENC	runtime·aeskeysched+16(SB), X1
  1123  	AESENC	runtime·aeskeysched+32(SB), X2
  1124  	AESENC	runtime·aeskeysched+48(SB), X3
  1125  	AESENC	runtime·aeskeysched+64(SB), X4
  1126  	AESENC	runtime·aeskeysched+80(SB), X5
  1127  	AESENC	runtime·aeskeysched+96(SB), X8
  1128  	AESENC	runtime·aeskeysched+112(SB), X9
  1129  
  1130  	// compute number of remaining 128-byte blocks
  1131  	DECQ	CX
  1132  	SHRQ	$7, CX
  1133  	
  1134  aesloop:
  1135  	// scramble state, xor in a block
  1136  	MOVOU	(AX), X10
  1137  	MOVOU	16(AX), X11
  1138  	MOVOU	32(AX), X12
  1139  	MOVOU	48(AX), X13
  1140  	AESENC	X10, X0
  1141  	AESENC	X11, X1
  1142  	AESENC	X12, X2
  1143  	AESENC	X13, X3
  1144  	MOVOU	64(AX), X10
  1145  	MOVOU	80(AX), X11
  1146  	MOVOU	96(AX), X12
  1147  	MOVOU	112(AX), X13
  1148  	AESENC	X10, X4
  1149  	AESENC	X11, X5
  1150  	AESENC	X12, X8
  1151  	AESENC	X13, X9
  1152  
  1153  	// scramble state
  1154  	AESENC	X7, X0
  1155  	AESENC	X7, X1
  1156  	AESENC	X7, X2
  1157  	AESENC	X7, X3
  1158  	AESENC	X7, X4
  1159  	AESENC	X7, X5
  1160  	AESENC	X7, X8
  1161  	AESENC	X7, X9
  1162  
  1163  	ADDQ	$128, AX
  1164  	DECQ	CX
  1165  	JNE	aesloop
  1166  
  1167  	// 2 more scrambles to finish
  1168  	AESENC	X7, X0
  1169  	AESENC	X7, X1
  1170  	AESENC	X7, X2
  1171  	AESENC	X7, X3
  1172  	AESENC	X7, X4
  1173  	AESENC	X7, X5
  1174  	AESENC	X7, X8
  1175  	AESENC	X7, X9
  1176  	AESENC	X7, X0
  1177  	AESENC	X7, X1
  1178  	AESENC	X7, X2
  1179  	AESENC	X7, X3
  1180  	AESENC	X7, X4
  1181  	AESENC	X7, X5
  1182  	AESENC	X7, X8
  1183  	AESENC	X7, X9
  1184  
  1185  	PXOR	X4, X0
  1186  	PXOR	X5, X1
  1187  	PXOR	X8, X2
  1188  	PXOR	X9, X3
  1189  	PXOR	X2, X0
  1190  	PXOR	X3, X1
  1191  	PXOR	X1, X0
  1192  	MOVQ	X0, (DX)
  1193  	RET
  1194  	
  1195  TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
  1196  	MOVQ	p+0(FP), AX	// ptr to data
  1197  	MOVQ	h+8(FP), X0	// seed
  1198  	PINSRD	$2, (AX), X0	// data
  1199  	AESENC	runtime·aeskeysched+0(SB), X0
  1200  	AESENC	runtime·aeskeysched+16(SB), X0
  1201  	AESENC	runtime·aeskeysched+32(SB), X0
  1202  	MOVQ	X0, ret+16(FP)
  1203  	RET
  1204  
  1205  TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
  1206  	MOVQ	p+0(FP), AX	// ptr to data
  1207  	MOVQ	h+8(FP), X0	// seed
  1208  	PINSRQ	$1, (AX), X0	// data
  1209  	AESENC	runtime·aeskeysched+0(SB), X0
  1210  	AESENC	runtime·aeskeysched+16(SB), X0
  1211  	AESENC	runtime·aeskeysched+32(SB), X0
  1212  	MOVQ	X0, ret+16(FP)
  1213  	RET
  1214  
  1215  // simple mask to get rid of data in the high part of the register.
  1216  DATA masks<>+0x00(SB)/8, $0x0000000000000000
  1217  DATA masks<>+0x08(SB)/8, $0x0000000000000000
  1218  DATA masks<>+0x10(SB)/8, $0x00000000000000ff
  1219  DATA masks<>+0x18(SB)/8, $0x0000000000000000
  1220  DATA masks<>+0x20(SB)/8, $0x000000000000ffff
  1221  DATA masks<>+0x28(SB)/8, $0x0000000000000000
  1222  DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
  1223  DATA masks<>+0x38(SB)/8, $0x0000000000000000
  1224  DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
  1225  DATA masks<>+0x48(SB)/8, $0x0000000000000000
  1226  DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
  1227  DATA masks<>+0x58(SB)/8, $0x0000000000000000
  1228  DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
  1229  DATA masks<>+0x68(SB)/8, $0x0000000000000000
  1230  DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
  1231  DATA masks<>+0x78(SB)/8, $0x0000000000000000
  1232  DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
  1233  DATA masks<>+0x88(SB)/8, $0x0000000000000000
  1234  DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
  1235  DATA masks<>+0x98(SB)/8, $0x00000000000000ff
  1236  DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
  1237  DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
  1238  DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
  1239  DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
  1240  DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
  1241  DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
  1242  DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
  1243  DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
  1244  DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
  1245  DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
  1246  DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
  1247  DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
  1248  GLOBL masks<>(SB),RODATA,$256
  1249  
  1250  // these are arguments to pshufb.  They move data down from
  1251  // the high bytes of the register to the low bytes of the register.
  1252  // index is how many bytes to move.
  1253  DATA shifts<>+0x00(SB)/8, $0x0000000000000000
  1254  DATA shifts<>+0x08(SB)/8, $0x0000000000000000
  1255  DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
  1256  DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
  1257  DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
  1258  DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
  1259  DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
  1260  DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
  1261  DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
  1262  DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
  1263  DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
  1264  DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
  1265  DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
  1266  DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
  1267  DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
  1268  DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
  1269  DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
  1270  DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
  1271  DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
  1272  DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
  1273  DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
  1274  DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
  1275  DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
  1276  DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
  1277  DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
  1278  DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
  1279  DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
  1280  DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
  1281  DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
  1282  DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
  1283  DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
  1284  DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
  1285  GLOBL shifts<>(SB),RODATA,$256
  1286  
  1287  TEXT runtime·memeq(SB),NOSPLIT,$0-25
  1288  	MOVQ	a+0(FP), SI
  1289  	MOVQ	b+8(FP), DI
  1290  	MOVQ	size+16(FP), BX
  1291  	LEAQ	ret+24(FP), AX
  1292  	JMP	runtime·memeqbody(SB)
  1293  
  1294  // memequal_varlen(a, b unsafe.Pointer) bool
  1295  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
  1296  	MOVQ	a+0(FP), SI
  1297  	MOVQ	b+8(FP), DI
  1298  	CMPQ	SI, DI
  1299  	JEQ	eq
  1300  	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
  1301  	LEAQ	ret+16(FP), AX
  1302  	JMP	runtime·memeqbody(SB)
  1303  eq:
  1304  	MOVB	$1, ret+16(FP)
  1305  	RET
  1306  
  1307  // eqstring tests whether two strings are equal.
  1308  // The compiler guarantees that strings passed
  1309  // to eqstring have equal length.
  1310  // See runtime_test.go:eqstring_generic for
  1311  // equivalent Go code.
  1312  TEXT runtime·eqstring(SB),NOSPLIT,$0-33
  1313  	MOVQ	s1str+0(FP), SI
  1314  	MOVQ	s2str+16(FP), DI
  1315  	CMPQ	SI, DI
  1316  	JEQ	eq
  1317  	MOVQ	s1len+8(FP), BX
  1318  	LEAQ	v+32(FP), AX
  1319  	JMP	runtime·memeqbody(SB)
  1320  eq:
  1321  	MOVB	$1, v+32(FP)
  1322  	RET
  1323  
  1324  // a in SI
  1325  // b in DI
  1326  // count in BX
  1327  // address of result byte in AX
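//
// Logically equivalent to (a sketch in Go):
//
//	func memeq(a, b []byte) bool {
//		for i := range a {
//			if a[i] != b[i] {
//				return false
//			}
//		}
//		return true
//	}
//
// but it compares 64 bytes at a time with SSE, then 8 bytes at a time,
// and uses overlapping or shifted loads for the tail so it never faults
// across a page boundary.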
  1328  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1329  	CMPQ	BX, $8
  1330  	JB	small
  1331  	
  1332  	// 64 bytes at a time using xmm registers
  1333  hugeloop:
  1334  	CMPQ	BX, $64
  1335  	JB	bigloop
  1336  	MOVOU	(SI), X0
  1337  	MOVOU	(DI), X1
  1338  	MOVOU	16(SI), X2
  1339  	MOVOU	16(DI), X3
  1340  	MOVOU	32(SI), X4
  1341  	MOVOU	32(DI), X5
  1342  	MOVOU	48(SI), X6
  1343  	MOVOU	48(DI), X7
  1344  	PCMPEQB	X1, X0
  1345  	PCMPEQB	X3, X2
  1346  	PCMPEQB	X5, X4
  1347  	PCMPEQB	X7, X6
  1348  	PAND	X2, X0
  1349  	PAND	X6, X4
  1350  	PAND	X4, X0
  1351  	PMOVMSKB X0, DX
  1352  	ADDQ	$64, SI
  1353  	ADDQ	$64, DI
  1354  	SUBQ	$64, BX
  1355  	CMPL	DX, $0xffff
  1356  	JEQ	hugeloop
  1357  	MOVB	$0, (AX)
  1358  	RET
  1359  
  1360  	// 8 bytes at a time using 64-bit register
  1361  bigloop:
  1362  	CMPQ	BX, $8
  1363  	JBE	leftover
  1364  	MOVQ	(SI), CX
  1365  	MOVQ	(DI), DX
  1366  	ADDQ	$8, SI
  1367  	ADDQ	$8, DI
  1368  	SUBQ	$8, BX
  1369  	CMPQ	CX, DX
  1370  	JEQ	bigloop
  1371  	MOVB	$0, (AX)
  1372  	RET
  1373  
  1374  	// remaining 0-8 bytes
  1375  leftover:
  1376  	MOVQ	-8(SI)(BX*1), CX
  1377  	MOVQ	-8(DI)(BX*1), DX
  1378  	CMPQ	CX, DX
  1379  	SETEQ	(AX)
  1380  	RET
  1381  
  1382  small:
  1383  	CMPQ	BX, $0
  1384  	JEQ	equal
  1385  
  1386  	LEAQ	0(BX*8), CX
  1387  	NEGQ	CX
  1388  
  1389  	CMPB	SI, $0xf8
  1390  	JA	si_high
  1391  
  1392  	// load at SI won't cross a page boundary.
  1393  	MOVQ	(SI), SI
  1394  	JMP	si_finish
  1395  si_high:
   1396  	// address ends in 11111xxx.  Load up to the bytes we want and shift them into place.
  1397  	MOVQ	-8(SI)(BX*1), SI
  1398  	SHRQ	CX, SI
  1399  si_finish:
  1400  
  1401  	// same for DI.
  1402  	CMPB	DI, $0xf8
  1403  	JA	di_high
  1404  	MOVQ	(DI), DI
  1405  	JMP	di_finish
  1406  di_high:
  1407  	MOVQ	-8(DI)(BX*1), DI
  1408  	SHRQ	CX, DI
  1409  di_finish:
  1410  
  1411  	SUBQ	SI, DI
  1412  	SHLQ	CX, DI
  1413  equal:
  1414  	SETEQ	(AX)
  1415  	RET
  1416  
  1417  TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
  1418  	MOVQ	s1_base+0(FP), SI
  1419  	MOVQ	s1_len+8(FP), BX
  1420  	MOVQ	s2_base+16(FP), DI
  1421  	MOVQ	s2_len+24(FP), DX
  1422  	LEAQ	ret+32(FP), R9
  1423  	JMP	runtime·cmpbody(SB)
  1424  
  1425  TEXT bytes·Compare(SB),NOSPLIT,$0-56
  1426  	MOVQ	s1+0(FP), SI
  1427  	MOVQ	s1+8(FP), BX
  1428  	MOVQ	s2+24(FP), DI
  1429  	MOVQ	s2+32(FP), DX
  1430  	LEAQ	res+48(FP), R9
  1431  	JMP	runtime·cmpbody(SB)
  1432  
  1433  // input:
  1434  //   SI = a
  1435  //   DI = b
  1436  //   BX = alen
  1437  //   DX = blen
  1438  //   R9 = address of output word (stores -1/0/1 here)
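//
// Logically equivalent to (a sketch in Go):
//
//	func cmp(a, b []byte) int {
//		n := len(a)
//		if len(b) < n {
//			n = len(b)
//		}
//		for i := 0; i < n; i++ {
//			if a[i] != b[i] {
//				if a[i] < b[i] {
//					return -1
//				}
//				return +1
//			}
//		}
//		switch {
//		case len(a) < len(b):
//			return -1
//		case len(a) > len(b):
//			return +1
//		}
//		return 0
//	}
//
// implemented with 16-byte SSE compares plus BSF/BSR to locate the first
// differing byte.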
  1439  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1440  	CMPQ	SI, DI
  1441  	JEQ	allsame
  1442  	CMPQ	BX, DX
  1443  	MOVQ	DX, R8
  1444  	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
  1445  	CMPQ	R8, $8
  1446  	JB	small
  1447  
  1448  loop:
  1449  	CMPQ	R8, $16
  1450  	JBE	_0through16
  1451  	MOVOU	(SI), X0
  1452  	MOVOU	(DI), X1
  1453  	PCMPEQB X0, X1
  1454  	PMOVMSKB X1, AX
  1455  	XORQ	$0xffff, AX	// convert EQ to NE
  1456  	JNE	diff16	// branch if at least one byte is not equal
  1457  	ADDQ	$16, SI
  1458  	ADDQ	$16, DI
  1459  	SUBQ	$16, R8
  1460  	JMP	loop
  1461  	
  1462  	// AX = bit mask of differences
  1463  diff16:
  1464  	BSFQ	AX, BX	// index of first byte that differs
  1465  	XORQ	AX, AX
  1466  	MOVB	(SI)(BX*1), CX
  1467  	CMPB	CX, (DI)(BX*1)
  1468  	SETHI	AX
  1469  	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
  1470  	MOVQ	AX, (R9)
  1471  	RET
  1472  
  1473  	// 0 through 16 bytes left, alen>=8, blen>=8
  1474  _0through16:
  1475  	CMPQ	R8, $8
  1476  	JBE	_0through8
  1477  	MOVQ	(SI), AX
  1478  	MOVQ	(DI), CX
  1479  	CMPQ	AX, CX
  1480  	JNE	diff8
  1481  _0through8:
  1482  	MOVQ	-8(SI)(R8*1), AX
  1483  	MOVQ	-8(DI)(R8*1), CX
  1484  	CMPQ	AX, CX
  1485  	JEQ	allsame
  1486  
  1487  	// AX and CX contain parts of a and b that differ.
  1488  diff8:
  1489  	BSWAPQ	AX	// reverse order of bytes
  1490  	BSWAPQ	CX
  1491  	XORQ	AX, CX
  1492  	BSRQ	CX, CX	// index of highest bit difference
  1493  	SHRQ	CX, AX	// move a's bit to bottom
  1494  	ANDQ	$1, AX	// mask bit
  1495  	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
  1496  	MOVQ	AX, (R9)
  1497  	RET
  1498  
  1499  	// 0-7 bytes in common
  1500  small:
  1501  	LEAQ	(R8*8), CX	// bytes left -> bits left
   1502  	NEGQ	CX		// - bits left (== 64 - bits left mod 64)
  1503  	JEQ	allsame
  1504  
   1505  	// load bytes of a into high bytes of SI
  1506  	CMPB	SI, $0xf8
  1507  	JA	si_high
  1508  	MOVQ	(SI), SI
  1509  	JMP	si_finish
  1510  si_high:
  1511  	MOVQ	-8(SI)(R8*1), SI
  1512  	SHRQ	CX, SI
  1513  si_finish:
  1514  	SHLQ	CX, SI
  1515  
   1516  	// load bytes of b into high bytes of DI
  1517  	CMPB	DI, $0xf8
  1518  	JA	di_high
  1519  	MOVQ	(DI), DI
  1520  	JMP	di_finish
  1521  di_high:
  1522  	MOVQ	-8(DI)(R8*1), DI
  1523  	SHRQ	CX, DI
  1524  di_finish:
  1525  	SHLQ	CX, DI
  1526  
  1527  	BSWAPQ	SI	// reverse order of bytes
  1528  	BSWAPQ	DI
  1529  	XORQ	SI, DI	// find bit differences
  1530  	JEQ	allsame
  1531  	BSRQ	DI, CX	// index of highest bit difference
  1532  	SHRQ	CX, SI	// move a's bit to bottom
  1533  	ANDQ	$1, SI	// mask bit
  1534  	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
  1535  	MOVQ	AX, (R9)
  1536  	RET
  1537  
  1538  allsame:
  1539  	XORQ	AX, AX
  1540  	XORQ	CX, CX
  1541  	CMPQ	BX, DX
  1542  	SETGT	AX	// 1 if alen > blen
  1543  	SETEQ	CX	// 1 if alen == blen
  1544  	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
  1545  	MOVQ	AX, (R9)
  1546  	RET
  1547  
  1548  TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
  1549  	MOVQ s+0(FP), SI
  1550  	MOVQ s_len+8(FP), BX
  1551  	MOVB c+24(FP), AL
  1552  	LEAQ ret+32(FP), R8
  1553  	JMP  runtime·indexbytebody(SB)
  1554  
  1555  TEXT strings·IndexByte(SB),NOSPLIT,$0-32
  1556  	MOVQ s+0(FP), SI
  1557  	MOVQ s_len+8(FP), BX
  1558  	MOVB c+16(FP), AL
  1559  	LEAQ ret+24(FP), R8
  1560  	JMP  runtime·indexbytebody(SB)
  1561  
  1562  // input:
  1563  //   SI: data
  1564  //   BX: data len
  1565  //   AL: byte sought
  1566  //   R8: address to put result
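//
// Logically equivalent to (a sketch in Go):
//
//	func indexByte(s []byte, c byte) int {
//		for i, b := range s {
//			if b == c {
//				return i
//			}
//		}
//		return -1
//	}
//
// using REPN; SCASB for the unaligned head and tail and PCMPEQB/PMOVMSKB
// over 16-byte chunks for the aligned middle.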
  1567  TEXT runtime·indexbytebody(SB),NOSPLIT,$0
  1568  	MOVQ SI, DI
  1569  
  1570  	CMPQ BX, $16
  1571  	JLT small
  1572  
  1573  	// round up to first 16-byte boundary
  1574  	TESTQ $15, SI
  1575  	JZ aligned
  1576  	MOVQ SI, CX
  1577  	ANDQ $~15, CX
  1578  	ADDQ $16, CX
  1579  
  1580  	// search the beginning
  1581  	SUBQ SI, CX
  1582  	REPN; SCASB
  1583  	JZ success
  1584  
  1585  // DI is 16-byte aligned; get ready to search using SSE instructions
  1586  aligned:
  1587  	// round down to last 16-byte boundary
  1588  	MOVQ BX, R11
  1589  	ADDQ SI, R11
  1590  	ANDQ $~15, R11
  1591  
  1592  	// shuffle X0 around so that each byte contains c
  1593  	MOVD AX, X0
  1594  	PUNPCKLBW X0, X0
  1595  	PUNPCKLBW X0, X0
  1596  	PSHUFL $0, X0, X0
  1597  	JMP condition
  1598  
  1599  sse:
  1600  	// move the next 16-byte chunk of the buffer into X1
  1601  	MOVO (DI), X1
  1602  	// compare bytes in X0 to X1
  1603  	PCMPEQB X0, X1
  1604  	// take the top bit of each byte in X1 and put the result in DX
  1605  	PMOVMSKB X1, DX
  1606  	TESTL DX, DX
  1607  	JNZ ssesuccess
  1608  	ADDQ $16, DI
  1609  
  1610  condition:
  1611  	CMPQ DI, R11
  1612  	JLT sse
  1613  
  1614  	// search the end
  1615  	MOVQ SI, CX
  1616  	ADDQ BX, CX
  1617  	SUBQ R11, CX
  1618  	// if CX == 0, the zero flag will be set and we'll end up
  1619  	// returning a false success
  1620  	JZ failure
  1621  	REPN; SCASB
  1622  	JZ success
  1623  
  1624  failure:
  1625  	MOVQ $-1, (R8)
  1626  	RET
  1627  
   1628  // handle lengths < 16
  1629  small:
  1630  	MOVQ BX, CX
  1631  	REPN; SCASB
  1632  	JZ success
  1633  	MOVQ $-1, (R8)
  1634  	RET
  1635  
  1636  // we've found the chunk containing the byte
  1637  // now just figure out which specific byte it is
  1638  ssesuccess:
  1639  	// get the index of the least significant set bit
  1640  	BSFW DX, DX
  1641  	SUBQ SI, DI
  1642  	ADDQ DI, DX
  1643  	MOVQ DX, (R8)
  1644  	RET
  1645  
  1646  success:
  1647  	SUBQ SI, DI
  1648  	SUBL $1, DI
  1649  	MOVQ DI, (R8)
  1650  	RET
  1651  
  1652  TEXT bytes·Equal(SB),NOSPLIT,$0-49
  1653  	MOVQ	a_len+8(FP), BX
  1654  	MOVQ	b_len+32(FP), CX
  1655  	CMPQ	BX, CX
  1656  	JNE	eqret
  1657  	MOVQ	a+0(FP), SI
  1658  	MOVQ	b+24(FP), DI
  1659  	LEAQ	ret+48(FP), AX
  1660  	JMP	runtime·memeqbody(SB)
  1661  eqret:
  1662  	MOVB	$0, ret+48(FP)
  1663  	RET
  1664  
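// fastrand1 steps m.fastrand through a 32-bit linear feedback shift
// register; roughly (a sketch in Go):
//
//	x := m.fastrand
//	x += x
//	if int32(x) < 0 {
//		x ^= 0x88888eef
//	}
//	m.fastrand = x
//	return x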
  1665  TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
  1666  	get_tls(CX)
  1667  	MOVQ	g(CX), AX
  1668  	MOVQ	g_m(AX), AX
  1669  	MOVL	m_fastrand(AX), DX
  1670  	ADDL	DX, DX
  1671  	MOVL	DX, BX
  1672  	XORL	$0x88888eef, DX
  1673  	CMOVLMI	BX, DX
  1674  	MOVL	DX, m_fastrand(AX)
  1675  	MOVL	DX, ret+0(FP)
  1676  	RET
  1677  
  1678  TEXT runtime·return0(SB), NOSPLIT, $0
  1679  	MOVL	$0, AX
  1680  	RET
  1681  
  1682  
  1683  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  1684  // Must obey the gcc calling convention.
  1685  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  1686  	get_tls(CX)
  1687  	MOVQ	g(CX), AX
  1688  	MOVQ	g_m(AX), AX
  1689  	MOVQ	m_curg(AX), AX
  1690  	MOVQ	(g_stack+stack_hi)(AX), AX
  1691  	RET
  1692  
  1693  // The top-most function running on a goroutine
  1694  // returns to goexit+PCQuantum.
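// The leading NOP matters: a goroutine's saved return PC is set to
// goexit+PCQuantum (goexit+1 on amd64), and that address must still fall
// inside goexit's code range for the traceback code to recognize it as
// the bottom of the stack.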
  1695  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  1696  	BYTE	$0x90	// NOP
  1697  	CALL	runtime·goexit1(SB)	// does not return
  1698  	// traceback from goexit1 must hit code range of goexit
  1699  	BYTE	$0x90	// NOP
  1700  
  1701  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
  1702  	MOVQ	addr+0(FP), AX
  1703  	PREFETCHT0	(AX)
  1704  	RET
  1705  
  1706  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
  1707  	MOVQ	addr+0(FP), AX
  1708  	PREFETCHT1	(AX)
  1709  	RET
  1710  
  1711  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
  1712  	MOVQ	addr+0(FP), AX
  1713  	PREFETCHT2	(AX)
  1714  	RET
  1715  
  1716  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
  1717  	MOVQ	addr+0(FP), AX
  1718  	PREFETCHNTA	(AX)
  1719  	RET
  1720  
  1721  // This is called from .init_array and follows the platform, not Go, ABI.
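// Equivalent to appending to the moduledata linked list (a sketch, where
// newModuledata stands for the *moduledata passed in DI per the platform
// ABI):
//
//	lastmoduledatap.next = newModuledata
//	lastmoduledatap = newModuledata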
  1722  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  1723  	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
  1724  	MOVQ	runtime·lastmoduledatap(SB), AX
  1725  	MOVQ	DI, moduledata_next(AX)
  1726  	MOVQ	DI, runtime·lastmoduledatap(SB)
  1727  	POPQ	R15
  1728  	RET