github.com/c0deoo1/golang1.5@v0.0.0-20220525150107-c87c805d4593/src/runtime/asm_amd64.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
        	// Process bootstrap. Entered with argc in DI and argv in SI.
        	// In order: carve g0's stack out of the current stack, probe CPUID,
        	// run _cgo_init if present, set up and test TLS, link m0 and g0,
        	// then call check, args, osinit, schedinit, queue runtime·main via
        	// newproc and enter mstart. Reaching the code after mstart is a bug.
    11  	// copy arguments forward on an even stack
    12  	MOVQ	DI, AX		// argc
    13  	MOVQ	SI, BX		// argv
    14  	SUBQ	$(4*8+7), SP		// 2args 2auto; over-reserve so the ANDQ below can align
    15  	ANDQ	$~15, SP		// round SP down to a 16-byte boundary
    16  	MOVQ	AX, 16(SP)          // argc
    17  	MOVQ	BX, 24(SP)          // argv
    18  	
    19  	// create istack out of the given (operating system) stack.
    20  	// _cgo_init may update stackguard.
    21  	MOVQ	$runtime·g0(SB), DI
    22  	LEAQ	(-64*1024+104)(SP), BX      // g0's stack is roughly 64KB-104 bytes below the current SP
    23  	MOVQ	BX, g_stackguard0(DI)
    24  	MOVQ	BX, g_stackguard1(DI)
    25  	MOVQ	BX, (g_stack+stack_lo)(DI) // set g0's stack bounds: lo = BX, hi = SP
    26  	MOVQ	SP, (g_stack+stack_hi)(DI)
    27  
    28  	// find out information about the processor we're on
    29  	MOVQ	$0, AX		// CPUID leaf 0
    30  	CPUID
    31  	CMPQ	AX, $0		// AX == 0 after leaf 0: skip all further CPUID queries
    32  	JE	nocpuinfo
    33  
    34  	// Figure out how to serialize RDTSC.
    35  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    36  	// Don't know about the rest, so let's do MFENCE.
    37  	CMPL	BX, $0x756E6547  // "Genu"
    38  	JNE	notintel
    39  	CMPL	DX, $0x49656E69  // "ineI"
    40  	JNE	notintel
    41  	CMPL	CX, $0x6C65746E  // "ntel"
    42  	JNE	notintel
    43  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)	// consulted by cputicks
    44  notintel:
    45  
    46  	MOVQ	$1, AX		// CPUID leaf 1: keep the CX and DX results
    47  	CPUID
    48  	MOVL	CX, runtime·cpuid_ecx(SB)
    49  	MOVL	DX, runtime·cpuid_edx(SB)
    50  nocpuinfo:	
    51  	
    52  	// if there is an _cgo_init, call it.
    53  	// cgo-related initialization. TODO
    54  	MOVQ	_cgo_init(SB), AX
    55  	TESTQ	AX, AX
    56  	JZ	needtls
    57  	// g0 already in DI
    58  	MOVQ	DI, CX	// Win64 uses CX for first parameter
    59  	MOVQ	$setg_gcc<>(SB), SI	// second argument: the setg_gcc helper
    60  	CALL	AX
    61  
    62  	// update stackguard after _cgo_init
    63  	MOVQ	$runtime·g0(SB), CX
    64  	MOVQ	(g_stack+stack_lo)(CX), AX
    65  	ADDQ	$const__StackGuard, AX
    66  	MOVQ	AX, g_stackguard0(CX)
    67  	MOVQ	AX, g_stackguard1(CX)
    68  
    69  	CMPL	runtime·iswindows(SB), $0
    70  	JEQ ok		// not Windows: skip the TLS setup below after _cgo_init ran
    71  needtls:
    72  	// skip TLS setup on Plan 9
    73  	CMPL	runtime·isplan9(SB), $1
    74  	JEQ ok
    75  	// skip TLS setup on Solaris
    76  	CMPL	runtime·issolaris(SB), $1
    77  	JEQ ok
    78  
    79  	LEAQ	runtime·tls0(SB), DI
    80  	CALL	runtime·settls(SB)
    81  
    82  	// store through it, to make sure it works
    83  	get_tls(BX)
    84  	MOVQ	$0x123, g(BX)
    85  	MOVQ	runtime·tls0(SB), AX
    86  	CMPQ	AX, $0x123
    87  	JEQ 2(PC)		// value round-tripped: skip the abort
    88  	MOVL	AX, 0	// abort
    89  ok:
    90  	// set the per-goroutine and per-mach "registers"
    91  	// TLS always points at the current g; link m0 and g0 to each other
    92  	get_tls(BX)
    93  	LEAQ	runtime·g0(SB), CX
    94  	MOVQ	CX, g(BX)
    95  	LEAQ	runtime·m0(SB), AX
    96  
    97  	// save m->g0 = g0
    98  	MOVQ	CX, m_g0(AX)
    99  	// save m0 to g0->m
   100  	MOVQ	AX, g_m(CX)
   101  
   102  	CLD				// convention is D is always left cleared
   103  	CALL	runtime·check(SB)
   104  
   105  	MOVL	16(SP), AX		// copy argc
   106  	MOVL	AX, 0(SP)
   107  	MOVQ	24(SP), AX		// copy argv
   108  	MOVQ	AX, 8(SP)
   109  	CALL	runtime·args(SB)        //   argument init (original note: saves the arguments and finds the fast vDSO time functions)
   110  	CALL	runtime·osinit(SB)      //   OS init (original note: gets the number of CPUs)
   111  	CALL	runtime·schedinit(SB)   //   scheduler init
   112  
   113  	// create a new goroutine to start program
   114  	// the initial G; its entry point is loaded from runtime·mainPC
   115  	MOVQ	$runtime·mainPC(SB), AX		// entry
   116  	PUSHQ	AX
   117  	PUSHQ	$0			// arg size
   118  	CALL	runtime·newproc(SB)
   119  	POPQ	AX		// discard the two pushed arguments
   120  	POPQ	AX
   121  
   122  	// start this M
   123  	// enters the scheduling loop (original note: the G/M/P loop)
   124  	CALL	runtime·mstart(SB)
   125  
   126  	MOVL	$0xf1, 0xf1  // crash
   127  	RET
   128  
   129  DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)	// 8-byte word holding the address of runtime·main
   130  GLOBL	runtime·mainPC(SB),RODATA,$8	// read-only; rt0_go passes its address to newproc
   131  
   132  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
        	// Emits the single byte 0xcc (the INT3 breakpoint opcode), then returns.
   133  	BYTE	$0xcc
   134  	RET
   135  
   136  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   137  	// No per-thread init: this is deliberately an empty function on amd64.
   138  	RET
   139  
   140  /*
   141   *  go-routine
   142   */
   143  
   144  // void gosave(Gobuf*)
   145  // save state in Gobuf; setjmp
        // Records the caller's SP, PC and BP plus the current g in *buf,
        // and zeroes buf's ret and ctxt fields.
   146  TEXT runtime·gosave(SB), NOSPLIT, $0-8
   147  	MOVQ	buf+0(FP), AX		// gobuf
   148  	LEAQ	buf+0(FP), BX		// caller's SP
   149  	MOVQ	BX, gobuf_sp(AX)
   150  	MOVQ	0(SP), BX		// caller's PC
   151  	MOVQ	BX, gobuf_pc(AX)
   152  	MOVQ	$0, gobuf_ret(AX)
   153  	MOVQ	$0, gobuf_ctxt(AX)
   154  	MOVQ	BP, gobuf_bp(AX)
   155  	get_tls(CX)
   156  	MOVQ	g(CX), BX		// current g from TLS
   157  	MOVQ	BX, gobuf_g(AX)
   158  	RET
   159  
   160  // void gogo(Gobuf*)
   161  // restore state from Gobuf; longjmp
        // Makes buf->g the current g, loads SP, AX (ret), DX (ctxt) and BP from
        // the Gobuf, zeroes those saved fields, and jumps to buf->pc.
        // Control does not return to the caller.
   162  TEXT runtime·gogo(SB), NOSPLIT, $0-8
   163  	MOVQ	buf+0(FP), BX		// gobuf
   164  	MOVQ	gobuf_g(BX), DX
   165  	MOVQ	0(DX), CX		// make sure g != nil
   166  	get_tls(CX)
   167  	MOVQ	DX, g(CX)		// install buf->g as the current g
   168  	MOVQ	gobuf_sp(BX), SP	// restore SP
   169  	MOVQ	gobuf_ret(BX), AX
   170  	MOVQ	gobuf_ctxt(BX), DX
   171  	MOVQ	gobuf_bp(BX), BP
   172  	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   173  	MOVQ	$0, gobuf_ret(BX)
   174  	MOVQ	$0, gobuf_ctxt(BX)
   175  	MOVQ	$0, gobuf_bp(BX)
   176  	MOVQ	gobuf_pc(BX), BX	// BX is reused: from here it holds the target PC
   177  	JMP	BX
   178  
   179  // func mcall(fn func(*g))
   180  // Switch to m->g0's stack, call fn(g).
   181  // Fn must never return.  It should gogo(&g->sched)
   182  // to keep running g.
        // The caller's PC, SP, BP and g are saved in g->sched before switching.
        // Calling mcall while already on g0 jumps to badmcall; if fn returns,
        // control jumps to badmcall2.
   183  TEXT runtime·mcall(SB), NOSPLIT, $0-8
   184  	MOVQ	fn+0(FP), DI
   185  	
   186  	get_tls(CX)
   187  	MOVQ	g(CX), AX	// save state in g->sched
   188  	MOVQ	0(SP), BX	// caller's PC
   189  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   190  	LEAQ	fn+0(FP), BX	// caller's SP
   191  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   192  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   193  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   194  
   195  	// switch to m->g0 & its stack, call fn
   196  	MOVQ	g(CX), BX
   197  	MOVQ	g_m(BX), BX
   198  	MOVQ	m_g0(BX), SI
   199  	CMPQ	SI, AX	// if g == m->g0 call badmcall
   200  	JNE	3(PC)	// normal case: skip the two-instruction badmcall jump
   201  	MOVQ	$runtime·badmcall(SB), AX
   202  	JMP	AX
   203  	MOVQ	SI, g(CX)	// g = m->g0
   204  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   205  	PUSHQ	AX	// argument for fn: the g we switched away from
   206  	MOVQ	DI, DX	// DX = fn, the closure value
   207  	MOVQ	0(DI), DI	// first word of the closure is the code pointer
   208  	CALL	DI
   209  	POPQ	AX	// only reached if fn returned, which is an error
   210  	MOVQ	$runtime·badmcall2(SB), AX
   211  	JMP	AX
   212  	RET
   213  
   214  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   215  // of the G stack.  We need to distinguish the routine that
   216  // lives at the bottom of the G stack from the one that lives
   217  // at the top of the system stack because the one at the top of
   218  // the system stack terminates the stack walk (see topofstack()).
        // Its address is only stored as a saved PC by systemstack; the body is a bare RET.
   219  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   220  	RET
   221  
   222  // func systemstack(fn func())
        // Runs fn on the m's g0 stack. If the current g is already gsignal or g0,
        // fn is called directly on the current stack. If it is m->curg, the g's
        // state is saved in g->sched, execution switches to g0's stack for the
        // call, and then switches back. Any other g calls badsystemstack.
   223  TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   224  	MOVQ	fn+0(FP), DI	// DI = fn
   225  	get_tls(CX)
   226  	MOVQ	g(CX), AX	// AX = g
   227  	MOVQ	g_m(AX), BX	// BX = m
   228  
   229  	MOVQ	m_gsignal(BX), DX	// DX = gsignal
   230  	CMPQ	AX, DX
   231  	JEQ	noswitch
   232  
   233  	MOVQ	m_g0(BX), DX	// DX = g0
   234  	CMPQ	AX, DX
   235  	JEQ	noswitch
   236  
   237  	MOVQ	m_curg(BX), R8
   238  	CMPQ	AX, R8
   239  	JEQ	switch
   240  	
   241  	// Bad: g is not gsignal, not g0, not curg. What is it?
   242  	MOVQ	$runtime·badsystemstack(SB), AX
   243  	CALL	AX
        	// NOTE(review): nothing stops execution here; if badsystemstack ever
        	// returned, control would fall through into switch. Presumably it
        	// never returns - confirm against its definition.
   244  
   245  switch:
   246  	// save our state in g->sched.  Pretend to
   247  	// be systemstack_switch if the G stack is scanned.
   248  	MOVQ	$runtime·systemstack_switch(SB), SI
   249  	MOVQ	SI, (g_sched+gobuf_pc)(AX)
   250  	MOVQ	SP, (g_sched+gobuf_sp)(AX)
   251  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   252  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   253  
   254  	// switch to g0
   255  	MOVQ	DX, g(CX)
   256  	MOVQ	(g_sched+gobuf_sp)(DX), BX
   257  	// make it look like mstart called systemstack on g0, to stop traceback
   258  	SUBQ	$8, BX
   259  	MOVQ	$runtime·mstart(SB), DX
   260  	MOVQ	DX, 0(BX)	// fake return address at the new top of stack
   261  	MOVQ	BX, SP
   262  
   263  	// call target function
   264  	MOVQ	DI, DX	// DX = fn, the closure value
   265  	MOVQ	0(DI), DI	// first word of the closure is the code pointer
   266  	CALL	DI
   267  
   268  	// switch back to g
   269  	get_tls(CX)
   270  	MOVQ	g(CX), AX
   271  	MOVQ	g_m(AX), BX
   272  	MOVQ	m_curg(BX), AX
   273  	MOVQ	AX, g(CX)
   274  	MOVQ	(g_sched+gobuf_sp)(AX), SP
   275  	MOVQ	$0, (g_sched+gobuf_sp)(AX)	// the saved SP is consumed; clear it
   276  	RET
   277  
   278  noswitch:
   279  	// already on m stack, just call directly
   280  	MOVQ	DI, DX
   281  	MOVQ	0(DI), DI
   282  	CALL	DI
   283  	RET
   284  
   285  /*
   286   * support for morestack
   287   */
   288  
   289  // Called during function prolog when more stack is needed.
   290  //
   291  // The traceback routines see morestack on a g0 as being
   292  // the top of a stack (for example, morestack calling newstack
   293  // calling the scheduler calling newm calling gc), so we must
   294  // record an argument size. For that purpose, it has no arguments.
        //
        // Saves f's caller in m->morebuf and f's own context (including the
        // closure register DX) in g->sched, then switches to m->g0's stack
        // and calls newstack. Traps with INT $3 if the current g is g0 or gsignal.
   295  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   296  	// Cannot grow scheduler stack (m->g0).
   297  	get_tls(CX)
   298  	MOVQ	g(CX), BX
   299  	MOVQ	g_m(BX), BX	// BX = m for the rest of the function
   300  	MOVQ	m_g0(BX), SI
   301  	CMPQ	g(CX), SI
   302  	JNE	2(PC)	// not on g0: skip the trap
   303  	INT	$3
   304  
   305  	// Cannot grow signal stack (m->gsignal).
   306  	MOVQ	m_gsignal(BX), SI
   307  	CMPQ	g(CX), SI
   308  	JNE	2(PC)	// not on gsignal: skip the trap
   309  	INT	$3
   310  
   311  	// Called from f.
   312  	// Set m->morebuf to f's caller.
   313  	MOVQ	8(SP), AX	// f's caller's PC
   314  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   315  	LEAQ	16(SP), AX	// f's caller's SP
   316  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   317  	get_tls(CX)
   318  	MOVQ	g(CX), SI	// SI = current g
   319  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   320  
   321  	// Set g->sched to context in f.
   322  	MOVQ	0(SP), AX // f's PC
   323  	MOVQ	AX, (g_sched+gobuf_pc)(SI)
   324  	MOVQ	SI, (g_sched+gobuf_g)(SI)
   325  	LEAQ	8(SP), AX // f's SP
   326  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   327  	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)	// DX holds f's closure context on entry
   328  	MOVQ	BP, (g_sched+gobuf_bp)(SI)
   329  
   330  	// Call newstack on m->g0's stack.
   331  	MOVQ	m_g0(BX), BX
   332  	MOVQ	BX, g(CX)
   333  	MOVQ	(g_sched+gobuf_sp)(BX), SP
   334  	CALL	runtime·newstack(SB)
   335  	MOVQ	$0, 0x1003	// crash if newstack returns
   336  	RET
   337  
   338  // morestack but not preserving ctxt.
        // Zeroes DX, the register morestack saves as g->sched.ctxt, then tail-jumps to morestack.
   339  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   340  	MOVL	$0, DX
   341  	JMP	runtime·morestack(SB)
   342  
   343  TEXT runtime·stackBarrier(SB),NOSPLIT,$0
        	// Looks up the saved return PC for the current barrier, advances
        	// g.stkbarPos by one, and jumps to that PC. Uses CX, DX and BX only.
   344  	// We came here via a RET to an overwritten return PC.
   345  	// AX may be live. Other registers are available.
   346  
   347  	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   348  	get_tls(CX)
   349  	MOVQ	g(CX), CX	// CX = g from here on
   350  	MOVQ	(g_stkbar+slice_array)(CX), DX	// DX = base of the g.stkbar array
   351  	MOVQ	g_stkbarPos(CX), BX
   352  	IMULQ	$stkbar__size, BX	// Too big for SIB.
   353  	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
   354  	// Record that this stack barrier was hit.
   355  	ADDQ	$1, g_stkbarPos(CX)
   356  	// Jump to the original return PC.
   357  	JMP	BX
   358  
   359  // reflectcall: call a function with the given argument list
   360  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   361  // we don't have variable-sized frames, so we use a small number
   362  // of constant-sized-frame functions to encode a few bits of size in the pc.
   363  // Caution: ugly multiline assembly macros in your future!
   364  
   365  #define DISPATCH(NAME,MAXSIZE)		\
   366  	CMPQ	CX, $MAXSIZE;		\
   367  	JA	3(PC);			\
   368  	MOVQ	$NAME(SB), AX;		\
   369  	JMP	AX
   370  // Note: can't just "JMP NAME(SB)" - bad inlining results.
        // DISPATCH(NAME, MAXSIZE): if CX (unsigned) is at most MAXSIZE, jump to
        // NAME through AX; otherwise skip the two-instruction jump and fall
        // through to whatever follows the macro.
   371  
   372  TEXT reflect·call(SB), NOSPLIT, $0-0
   373  	JMP	·reflectcall(SB)	// alias: tail-jump into reflectcall with the caller's frame untouched
   374  
   375  TEXT ·reflectcall(SB), NOSPLIT, $0-32
        	// Loads the 32-bit argsize into CX and jumps to the first callN
        	// routine, in increasing order of N, with argsize <= N. Sizes above
        	// 1073741824 jump to badreflectcall.
   376  	MOVLQZX argsize+24(FP), CX
   377  	// NOTE(rsc): No call16, because CALLFN needs four words
   378  	// of argument space to invoke callwritebarrier.
   379  	DISPATCH(runtime·call32, 32)
   380  	DISPATCH(runtime·call64, 64)
   381  	DISPATCH(runtime·call128, 128)
   382  	DISPATCH(runtime·call256, 256)
   383  	DISPATCH(runtime·call512, 512)
   384  	DISPATCH(runtime·call1024, 1024)
   385  	DISPATCH(runtime·call2048, 2048)
   386  	DISPATCH(runtime·call4096, 4096)
   387  	DISPATCH(runtime·call8192, 8192)
   388  	DISPATCH(runtime·call16384, 16384)
   389  	DISPATCH(runtime·call32768, 32768)
   390  	DISPATCH(runtime·call65536, 65536)
   391  	DISPATCH(runtime·call131072, 131072)
   392  	DISPATCH(runtime·call262144, 262144)
   393  	DISPATCH(runtime·call524288, 524288)
   394  	DISPATCH(runtime·call1048576, 1048576)
   395  	DISPATCH(runtime·call2097152, 2097152)
   396  	DISPATCH(runtime·call4194304, 4194304)
   397  	DISPATCH(runtime·call8388608, 8388608)
   398  	DISPATCH(runtime·call16777216, 16777216)
   399  	DISPATCH(runtime·call33554432, 33554432)
   400  	DISPATCH(runtime·call67108864, 67108864)
   401  	DISPATCH(runtime·call134217728, 134217728)
   402  	DISPATCH(runtime·call268435456, 268435456)
   403  	DISPATCH(runtime·call536870912, 536870912)
   404  	DISPATCH(runtime·call1073741824, 1073741824)
   405  	MOVQ	$runtime·badreflectcall(SB), AX	// argsize exceeded every bucket
   406  	JMP	AX
   407  
   408  #define CALLFN(NAME,MAXSIZE)			\
   409  TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   410  	NO_LOCAL_POINTERS;			\
   411  	/* copy argsize bytes from argptr to the base of this frame (SP) */		\
   412  	MOVQ	argptr+16(FP), SI;		\
   413  	MOVLQZX argsize+24(FP), CX;		\
   414  	MOVQ	SP, DI;				\
   415  	REP;MOVSB;				\
   416  	/* call function: DX = f, called through its first word */			\
   417  	MOVQ	f+8(FP), DX;			\
   418  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   419  	CALL	(DX);				\
   420  	/* copy return values back: bytes [retoffset, argsize) from SP to argptr */		\
   421  	MOVQ	argptr+16(FP), DI;		\
   422  	MOVLQZX	argsize+24(FP), CX;		\
   423  	MOVLQZX retoffset+28(FP), BX;		\
   424  	MOVQ	SP, SI;				\
   425  	ADDQ	BX, DI;				\
   426  	ADDQ	BX, SI;				\
   427  	SUBQ	BX, CX;				\
   428  	REP;MOVSB;				\
   429  	/* execute write barrier updates: callwritebarrier(argtype, argptr, argsize, retoffset) */	\
   430  	MOVQ	argtype+0(FP), DX;		\
   431  	MOVQ	argptr+16(FP), DI;		\
   432  	MOVLQZX	argsize+24(FP), CX;		\
   433  	MOVLQZX retoffset+28(FP), BX;		\
   434  	MOVQ	DX, 0(SP);			\
   435  	MOVQ	DI, 8(SP);			\
   436  	MOVQ	CX, 16(SP);			\
   437  	MOVQ	BX, 24(SP);			\
   438  	CALL	runtime·callwritebarrier(SB);	\
   439  	RET
   440  
   441  CALLFN(·call32, 32)
   442  CALLFN(·call64, 64)
   443  CALLFN(·call128, 128)
   444  CALLFN(·call256, 256)
   445  CALLFN(·call512, 512)
   446  CALLFN(·call1024, 1024)
   447  CALLFN(·call2048, 2048)
   448  CALLFN(·call4096, 4096)
   449  CALLFN(·call8192, 8192)
   450  CALLFN(·call16384, 16384)
   451  CALLFN(·call32768, 32768)
   452  CALLFN(·call65536, 65536)
   453  CALLFN(·call131072, 131072)
   454  CALLFN(·call262144, 262144)
   455  CALLFN(·call524288, 524288)
   456  CALLFN(·call1048576, 1048576)
   457  CALLFN(·call2097152, 2097152)
   458  CALLFN(·call4194304, 4194304)
   459  CALLFN(·call8388608, 8388608)
   460  CALLFN(·call16777216, 16777216)
   461  CALLFN(·call33554432, 33554432)
   462  CALLFN(·call67108864, 67108864)
   463  CALLFN(·call134217728, 134217728)
   464  CALLFN(·call268435456, 268435456)
   465  CALLFN(·call536870912, 536870912)
   466  CALLFN(·call1073741824, 1073741824)
        // One fixed-frame call routine per power-of-two size from 32 bytes to
        // 1073741824 bytes; the sizes match the DISPATCH list in reflectcall one for one.
   467  
   468  // bool cas(int32 *val, int32 old, int32 new)
   469  // 32-bit compare-and-swap, performed atomically:
   470  //	if(*val == old){
   471  //		*val = new;
   472  //		return 1;
   473  //	} else
   474  //		return 0;
   475  TEXT runtime·cas(SB), NOSPLIT, $0-17
   476  	MOVL	new+12(FP), DX		// replacement value
   477  	MOVL	old+8(FP), AX		// CMPXCHG compares memory against AX
   478  	MOVQ	ptr+0(FP), SI		// address of *val
   479  	LOCK
   480  	CMPXCHGL	DX, 0(SI)
   481  	SETEQ	ret+16(FP)		// ZF set means the swap happened
   482  	RET
   483  
   484  // bool	runtime·cas64(uint64 *val, uint64 old, uint64 new)
   485  // 64-bit compare-and-swap, performed atomically:
   486  //	if(*val == old){
   487  //		*val = new;
   488  //		return 1;
   489  //	} else {
   490  //		return 0;
   491  //	}
   492  TEXT runtime·cas64(SB), NOSPLIT, $0-25
   493  	MOVQ	new+16(FP), DX		// replacement value
   494  	MOVQ	old+8(FP), AX		// CMPXCHG compares memory against AX
   495  	MOVQ	ptr+0(FP), SI		// address of *val
   496  	LOCK
   497  	CMPXCHGQ	DX, 0(SI)
   498  	SETEQ	ret+24(FP)		// ZF set means the swap happened
   499  	RET
   500  	
   501  TEXT runtime·casuintptr(SB), NOSPLIT, $0-25
   502  	JMP	runtime·cas64(SB)	// same 25-byte argument frame as cas64, so a tail-jump suffices
   503  
   504  TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-16
   505  	JMP	runtime·atomicload64(SB)	// tail-jump; atomicload64 is defined outside this listing
   506  
   507  TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-16
   508  	JMP	runtime·atomicload64(SB)	// tail-jump; atomicload64 is defined outside this listing
   509  
   510  TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16
   511  	JMP	runtime·atomicstore64(SB)	// same 16-byte argument frame as atomicstore64
   512  
   513  // bool casp1(void **val, void *old, void *new)
   514  // Pointer-sized compare-and-swap, performed atomically:
   515  //	if(*val == old){
   516  //		*val = new;
   517  //		return 1;
   518  //	} else
   519  //		return 0;
   520  TEXT runtime·casp1(SB), NOSPLIT, $0-25
   521  	MOVQ	new+16(FP), DX		// replacement pointer
   522  	MOVQ	old+8(FP), AX		// CMPXCHG compares memory against AX
   523  	MOVQ	ptr+0(FP), SI		// address of *val
   524  	LOCK
   525  	CMPXCHGQ	DX, 0(SI)
   526  	SETEQ	ret+24(FP)		// ZF set means the swap happened
   527  	RET
   528  
   529  // uint32 xadd(uint32 volatile *val, int32 delta)
   530  // Atomic 32-bit add that returns the updated value:
   531  //	*val += delta;
   532  //	return *val;
   533  TEXT runtime·xadd(SB), NOSPLIT, $0-20
   534  	MOVL	delta+8(FP), DX		// keep delta for the final sum
   535  	MOVQ	ptr+0(FP), SI
   536  	MOVL	DX, AX
   537  	LOCK
   538  	XADDL	AX, 0(SI)		// AX = previous *val; *val += delta
   539  	ADDL	DX, AX			// previous + delta = new value
   540  	MOVL	AX, ret+16(FP)
   541  	RET
   542  
   543  TEXT runtime·xadd64(SB), NOSPLIT, $0-24
   544  	MOVQ	delta+8(FP), DX		// keep delta for the final sum
   545  	MOVQ	ptr+0(FP), SI
   546  	MOVQ	DX, AX
   547  	LOCK
   548  	XADDQ	AX, 0(SI)		// AX = previous *ptr; *ptr += delta
   549  	ADDQ	DX, AX			// previous + delta = new value
   550  	MOVQ	AX, ret+16(FP)		// 64-bit counterpart of xadd: returns the updated value
   551  	RET
   552  
   553  TEXT runtime·xadduintptr(SB), NOSPLIT, $0-24
   554  	JMP	runtime·xadd64(SB)	// same 24-byte argument frame as xadd64
   555  
   556  TEXT runtime·xchg(SB), NOSPLIT, $0-20
   557  	MOVL	new+8(FP), DX		// value to store
   558  	MOVQ	ptr+0(FP), SI
   559  	XCHGL	DX, 0(SI)		// swap: DX receives the previous 32-bit contents
   560  	MOVL	DX, ret+16(FP)		// return the previous value
   561  	RET
   562  
   563  TEXT runtime·xchg64(SB), NOSPLIT, $0-24
   564  	MOVQ	new+8(FP), DX		// value to store
   565  	MOVQ	ptr+0(FP), SI
   566  	XCHGQ	DX, 0(SI)		// swap: DX receives the previous 64-bit contents
   567  	MOVQ	DX, ret+16(FP)		// return the previous value
   568  	RET
   569  
   570  TEXT runtime·xchgp1(SB), NOSPLIT, $0-24
   571  	MOVQ	new+8(FP), DX		// pointer-sized value to store
   572  	MOVQ	ptr+0(FP), SI
   573  	XCHGQ	DX, 0(SI)		// swap: DX receives the previous contents
   574  	MOVQ	DX, ret+16(FP)		// return the previous value
   575  	RET
   576  
   577  TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24
   578  	JMP	runtime·xchg64(SB)	// same 24-byte argument frame as xchg64
   579  
   580  TEXT runtime·procyield(SB),NOSPLIT,$0-4
        	// Executes PAUSE `cycles` times, then returns.
        	// The argument size is declared as 4 because the body reads a 32-bit
        	// argument at cycles+0(FP).
        	// NOTE(review): with cycles == 0 the SUBL wraps and the loop runs
        	// 2^32 times; callers are presumably expected to pass a positive count.
   581  	MOVL	cycles+0(FP), AX
   582  again:
   583  	PAUSE
   584  	SUBL	$1, AX
   585  	JNZ	again
   586  	RET
   587  
   588  TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16
   589  	MOVQ	val+8(FP), DX		// pointer-sized value to publish
   590  	MOVQ	ptr+0(FP), SI
   591  	XCHGQ	DX, 0(SI)		// store via exchange; the previous contents are discarded
   592  	RET
   593  
   594  TEXT runtime·atomicstore(SB), NOSPLIT, $0-12
   595  	MOVL	val+8(FP), DX		// 32-bit value to publish
   596  	MOVQ	ptr+0(FP), SI
   597  	XCHGL	DX, 0(SI)		// store via exchange; the previous contents are discarded
   598  	RET
   599  
   600  TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16
   601  	MOVQ	val+8(FP), DX		// 64-bit value to publish
   602  	MOVQ	ptr+0(FP), SI
   603  	XCHGQ	DX, 0(SI)		// store via exchange; the previous contents are discarded
   604  	RET
   605  
   606  // void	runtime·atomicor8(byte volatile*, byte);
   607  TEXT runtime·atomicor8(SB), NOSPLIT, $0-9
   608  	MOVB	val+8(FP), CX		// bits to set
   609  	MOVQ	ptr+0(FP), DX		// target byte
   610  	LOCK
   611  	ORB	CX, (DX)		// *ptr |= val under the LOCK prefix
   612  	RET
   613  
   614  // void	runtime·atomicand8(byte volatile*, byte);
   615  TEXT runtime·atomicand8(SB), NOSPLIT, $0-9
   616  	MOVB	val+8(FP), CX		// mask to apply
   617  	MOVQ	ptr+0(FP), DX		// target byte
   618  	LOCK
   619  	ANDB	CX, (DX)		// *ptr &= val under the LOCK prefix
   620  	RET
   621  
   622  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   623  	// Stores are already ordered on x86, so this is just a
   624  	// compile barrier.
        	// No instruction is emitted other than the return.
   625  	RET
   626  
   627  // void jmpdefer(fn, sp);
   628  // called from deferreturn.
   629  // 1. pop the caller
   630  // 2. sub 5 bytes from the callers return
   631  // 3. jmp to the argument
        // fn is a closure value: it is left in DX and its first word is the code
        // pointer jumped to. Subtracting 5 from the saved return address makes
        // the deferred function return to the CALL instruction itself.
   632  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   633  	MOVQ	fv+0(FP), DX	// fn
   634  	MOVQ	argp+8(FP), BX	// caller sp
   635  	LEAQ	-8(BX), SP	// caller sp after CALL
   636  	SUBQ	$5, (SP)	// return to CALL again
   637  	MOVQ	0(DX), BX
   638  	JMP	BX	// but first run the deferred function
   639  
   640  // Save state of caller into g->sched. Smashes R8, R9.
        // Stores the return PC, the caller's SP (the value just above the return
        // address) and BP, and zeroes sched.ret and sched.ctxt. sched.g is not written.
   641  TEXT gosave<>(SB),NOSPLIT,$0
   642  	get_tls(R8)
   643  	MOVQ	g(R8), R8	// R8 = current g
   644  	MOVQ	0(SP), R9	// return address = caller's PC
   645  	MOVQ	R9, (g_sched+gobuf_pc)(R8)
   646  	LEAQ	8(SP), R9	// caller's SP as it will be after this function returns
   647  	MOVQ	R9, (g_sched+gobuf_sp)(R8)
   648  	MOVQ	$0, (g_sched+gobuf_ret)(R8)
   649  	MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
   650  	MOVQ	BP, (g_sched+gobuf_bp)(R8)
   651  	RET
   652  
   653  // func asmcgocall(fn, arg unsafe.Pointer) int32
   654  // Call fn(arg) on the scheduler stack,
   655  // aligned appropriately for the gcc ABI.
   656  // See cgocall.go for more details.
        // arg is passed in both DI and CX so one code path serves both calling
        // conventions. The 32-bit value fn leaves in AX is the result.
   657  TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   658  	MOVQ	fn+0(FP), AX
   659  	MOVQ	arg+8(FP), BX
   660  
   661  	MOVQ	SP, DX	// DX = SP on entry, used below to compute the stack depth
   662  
   663  	// Figure out if we need to switch to m->g0 stack.
   664  	// We get called to create new OS threads too, and those
   665  	// come in on the m->g0 stack already.
   666  	get_tls(CX)
   667  	MOVQ	g(CX), R8
   668  	MOVQ	g_m(R8), R8	// R8 = m
   669  	MOVQ	m_g0(R8), SI
   670  	MOVQ	g(CX), DI	// DI = current g
   671  	CMPQ	SI, DI
   672  	JEQ	nosave	// already on g0
   673  	MOVQ	m_gsignal(R8), SI
   674  	CMPQ	SI, DI
   675  	JEQ	nosave	// on gsignal: do not switch either
   676  	
   677  	MOVQ	m_g0(R8), SI
   678  	CALL	gosave<>(SB)	// record our state in g->sched (smashes R8, R9)
   679  	MOVQ	SI, g(CX)	// g = m->g0
   680  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   681  nosave:
   682  
   683  	// Now on a scheduling stack (a pthread-created stack).
   684  	// Make sure we have enough room for 4 stack-backed fast-call
   685  	// registers as per windows amd64 calling convention.
   686  	SUBQ	$64, SP
   687  	ANDQ	$~15, SP	// alignment for gcc ABI
   688  	MOVQ	DI, 48(SP)	// save g
   689  	MOVQ	(g_stack+stack_hi)(DI), DI
   690  	SUBQ	DX, DI	// depth = g->stack.hi - entry SP
   691  	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   692  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   693  	MOVQ	BX, CX		// CX = first argument in Win64
   694  	CALL	AX
   695  
   696  	// Restore registers, g, stack pointer.
   697  	get_tls(CX)
   698  	MOVQ	48(SP), DI	// the g saved before the call
   699  	MOVQ	(g_stack+stack_hi)(DI), SI
   700  	SUBQ	40(SP), SI	// SP = g->stack.hi - saved depth
   701  	MOVQ	DI, g(CX)
   702  	MOVQ	SI, SP
   703  
   704  	MOVL	AX, ret+16(FP)
   705  	RET
   706  
   707  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
   708  // Turn the fn into a Go func (by taking its address) and call
   709  // cgocallback_gofunc.
        // The three outgoing arguments are laid out at 0(SP), 8(SP) and 16(SP);
        // the call goes through AX rather than naming the target directly.
   710  TEXT runtime·cgocallback(SB),NOSPLIT,$24-24
   711  	LEAQ	fn+0(FP), AX	// address of the fn slot serves as the func value
   712  	MOVQ	AX, 0(SP)
   713  	MOVQ	frame+8(FP), AX
   714  	MOVQ	AX, 8(SP)
   715  	MOVQ	framesize+16(FP), AX
   716  	MOVQ	AX, 16(SP)
   717  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   718  	CALL	AX
   719  	RET
   720  
   721  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
   722  // See cgocall.go for more details.
        // Outline: obtain an m (borrowing one through needm if the thread has no
        // g), save g0's sched.sp at 0(SP), switch to m->curg's stack, call
        // cgocallbackg, restore curg's sched.pc and sched.sp, switch back to g0,
        // and release a borrowed m with dropm. R8 holds the m found on entry
        // (zero if one was borrowed) and is kept at 0(SP) across the call.
   723  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-24
   724  	NO_LOCAL_POINTERS
   725  
   726  	// If g is nil, Go did not create the current thread.
   727  	// Call needm to obtain one m for temporary use.
   728  	// In this case, we're running on the thread stack, so there's
   729  	// lots of space, but the linker doesn't know. Hide the call from
   730  	// the linker analysis by using an indirect call through AX.
   731  	get_tls(CX)
   732  #ifdef GOOS_windows
   733  	MOVL	$0, BX
   734  	CMPQ	CX, $0
   735  	JEQ	2(PC)	// no TLS base: skip the load below and leave BX = 0
   736  #endif
   737  	MOVQ	g(CX), BX
   738  	CMPQ	BX, $0
   739  	JEQ	needm
   740  	MOVQ	g_m(BX), BX
   741  	MOVQ	BX, R8 // holds oldm until end of function
   742  	JMP	havem
   743  needm:
   744  	MOVQ	$0, 0(SP)
   745  	MOVQ	$runtime·needm(SB), AX
   746  	CALL	AX
   747  	MOVQ	0(SP), R8	// reload the zero stored above: no previous m
   748  	get_tls(CX)
   749  	MOVQ	g(CX), BX
   750  	MOVQ	g_m(BX), BX
   751  	
   752  	// Set m->sched.sp = SP, so that if a panic happens
   753  	// during the function we are about to execute, it will
   754  	// have a valid SP to run on the g0 stack.
   755  	// The next few lines (after the havem label)
   756  	// will save this SP onto the stack and then write
   757  	// the same SP back to m->sched.sp. That seems redundant,
   758  	// but if an unrecovered panic happens, unwindm will
   759  	// restore the g->sched.sp from the stack location
   760  	// and then systemstack will try to use it. If we don't set it here,
   761  	// that restored SP will be uninitialized (typically 0) and
   762  	// will not be usable.
        	// (The field actually written below is m->g0->sched.sp.)
   763  	MOVQ	m_g0(BX), SI
   764  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   765  
   766  havem:
   767  	// Now there's a valid m, and we're running on its m->g0.
   768  	// Save current m->g0->sched.sp on stack and then set it to SP.
   769  	// Save current sp in m->g0->sched.sp in preparation for
   770  	// switch back to m->curg stack.
   771  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   772  	MOVQ	m_g0(BX), SI
   773  	MOVQ	(g_sched+gobuf_sp)(SI), AX
   774  	MOVQ	AX, 0(SP)
   775  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   776  
   777  	// Switch to m->curg stack and call runtime.cgocallbackg.
   778  	// Because we are taking over the execution of m->curg
   779  	// but *not* resuming what had been running, we need to
   780  	// save that information (m->curg->sched) so we can restore it.
   781  	// We can restore m->curg->sched.sp easily, because calling
   782  	// runtime.cgocallbackg leaves SP unchanged upon return.
   783  	// To save m->curg->sched.pc, we push it onto the stack.
   784  	// This has the added benefit that it looks to the traceback
   785  	// routine like cgocallbackg is going to return to that
   786  	// PC (because the frame we allocate below has the same
   787  	// size as cgocallback_gofunc's frame declared above)
   788  	// so that the traceback will seamlessly trace back into
   789  	// the earlier calls.
   790  	//
   791  	// In the new goroutine, 0(SP) holds the saved R8.
   792  	MOVQ	m_curg(BX), SI
   793  	MOVQ	SI, g(CX)
   794  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   795  	MOVQ	(g_sched+gobuf_pc)(SI), BX
   796  	MOVQ	BX, -8(DI)	// saved curg PC sits where a return address would be
   797  	// Compute the size of the frame, including return PC and, if
   798  	// GOEXPERIMENT=framepointer, the saved base pointer
   799  	LEAQ	fv+0(FP), AX
   800  	SUBQ	SP, AX	// AX = distance from SP up to the first argument
   801  	SUBQ	AX, DI
   802  	MOVQ	DI, SP
   803  
   804  	MOVQ	R8, 0(SP)
   805  	CALL	runtime·cgocallbackg(SB)
   806  	MOVQ	0(SP), R8
   807  
   808  	// Compute the size of the frame again.  FP and SP have
   809  	// completely different values here than they did above,
   810  	// but only their difference matters.
   811  	LEAQ	fv+0(FP), AX
   812  	SUBQ	SP, AX
   813  
   814  	// Restore g->sched (== m->curg->sched) from saved values.
   815  	get_tls(CX)
   816  	MOVQ	g(CX), SI
   817  	MOVQ	SP, DI
   818  	ADDQ	AX, DI	// DI = curg's sched.sp as it was before the switch
   819  	MOVQ	-8(DI), BX	// the PC stashed before the call
   820  	MOVQ	BX, (g_sched+gobuf_pc)(SI)
   821  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   822  
   823  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   824  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   825  	// so we do not have to restore it.)
   826  	MOVQ	g(CX), BX
   827  	MOVQ	g_m(BX), BX
   828  	MOVQ	m_g0(BX), SI
   829  	MOVQ	SI, g(CX)
   830  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   831  	MOVQ	0(SP), AX	// the g0 sched.sp saved at havem
   832  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   833  	
   834  	// If the m on entry was nil, we called needm above to borrow an m
   835  	// for the duration of the call. Since the call is over, return it with dropm.
   836  	CMPQ	R8, $0
   837  	JNE 3(PC)	// had an m on entry: skip the dropm call
   838  	MOVQ	$runtime·dropm(SB), AX
   839  	CALL	AX
   840  
   841  	// Done!
   842  	RET
   843  
   844  // void setg(G*); set g. for use by needm.
        // On Windows builds the slot at 0x28(GS) is updated first: cleared for a
        // nil g (returning without touching the g slot), otherwise pointed at
        // the m's tls area.
   845  TEXT runtime·setg(SB), NOSPLIT, $0-8
   846  	MOVQ	gg+0(FP), BX
   847  #ifdef GOOS_windows
   848  	CMPQ	BX, $0
   849  	JNE	settls
   850  	MOVQ	$0, 0x28(GS)
   851  	RET
   852  settls:
   853  	MOVQ	g_m(BX), AX
   854  	LEAQ	m_tls(AX), AX	// AX = &gg->m->tls
   855  	MOVQ	AX, 0x28(GS)
   856  #endif
   857  	get_tls(CX)
   858  	MOVQ	BX, g(CX)	// store gg in the TLS g slot
   859  	RET
   860  
   861  // void setg_gcc(G*); set g called from gcc.
        // The G pointer arrives in DI rather than on the Go argument frame.
   862  TEXT setg_gcc<>(SB),NOSPLIT,$0
   863  	get_tls(AX)
   864  	MOVQ	DI, g(AX)	// store it in the TLS g slot
   865  	RET
   866  
   867  // check that SP is in range [g->stack.lo, g->stack.hi)
        // As coded, both comparisons are strict and unsigned: INT $3 traps unless
        // g->stack.lo < SP < g->stack.hi.
   868  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   869  	get_tls(CX)
   870  	MOVQ	g(CX), AX
   871  	CMPQ	(g_stack+stack_hi)(AX), SP
   872  	JHI	2(PC)	// stack.hi > SP: skip the trap
   873  	INT	$3
   874  	CMPQ	SP, (g_stack+stack_lo)(AX)
   875  	JHI	2(PC)	// SP > stack.lo: skip the trap
   876  	INT	$3
   877  	RET
   878  
   879  TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
        	// Given the address of the caller's first argument, returns the word
        	// stored just below it (the return PC). If that word equals
        	// stackBarrierPC, the value is taken from nextBarrierPC instead.
   880  	MOVQ	argp+0(FP),AX		// addr of first arg
   881  	MOVQ	-8(AX),AX		// get calling pc
   882  	CMPQ	AX, runtime·stackBarrierPC(SB)
   883  	JNE	nobar
   884  	// Get original return PC.
   885  	CALL	runtime·nextBarrierPC(SB)
   886  	MOVQ	0(SP), AX	// nextBarrierPC's result slot in our 8-byte frame
   887  nobar:
   888  	MOVQ	AX, ret+8(FP)
   889  	RET
   890  
   891  TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
        	// Overwrites the return PC stored just below argp with pc. If that
        	// slot currently holds stackBarrierPC, it is left alone and pc is
        	// passed to setNextBarrierPC instead.
   892  	MOVQ	argp+0(FP),AX		// addr of first arg
   893  	MOVQ	pc+8(FP), BX
   894  	MOVQ	-8(AX), CX	// current return PC
   895  	CMPQ	CX, runtime·stackBarrierPC(SB)
   896  	JEQ	setbar
   897  	MOVQ	BX, -8(AX)		// set calling pc
   898  	RET
   899  setbar:
   900  	// Set the stack barrier return PC.
   901  	MOVQ	BX, 0(SP)	// outgoing argument for setNextBarrierPC
   902  	CALL	runtime·setNextBarrierPC(SB)
   903  	RET
   904  
   905  TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
   906  	MOVQ	argp+0(FP), CX		// the argument is already the value to return
   907  	MOVQ	CX, ret+8(FP)		// hand argp back unchanged
   908  	RET
   909  
   910  // func cputicks() int64
        // Returns the 64-bit time-stamp counter. A fence is issued before RDTSC:
        // LFENCE when lfenceBeforeRdtsc is 1 (set by rt0_go on "GenuineIntel"
        // CPUs), MFENCE otherwise. The argument size is declared as 8 because
        // the body stores an 8-byte result at ret+0(FP).
   911  TEXT runtime·cputicks(SB),NOSPLIT,$0-8
   912  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   913  	JNE	mfence
   914  	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
   915  	JMP	done
   916  mfence:
   917  	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
   918  done:
   919  	RDTSC
   920  	SHLQ	$32, DX		// RDTSC returns the high half in DX, the low half in AX
   921  	ADDQ	DX, AX		// combine into one 64-bit value
   922  	MOVQ	AX, ret+0(FP)
   923  	RET
   924  
   925  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   926  // redirects to memhash(p, h, size) using the size
   927  // stored in the closure.
        // The closure pointer is expected in DX; the size is read from its second word.
   928  TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
   929  	GO_ARGS
   930  	NO_LOCAL_POINTERS
   931  	MOVQ	p+0(FP), AX
   932  	MOVQ	h+8(FP), BX
   933  	MOVQ	8(DX), CX	// size, from the closure at offset 8
   934  	MOVQ	AX, 0(SP)	// outgoing arguments: p, h, size
   935  	MOVQ	BX, 8(SP)
   936  	MOVQ	CX, 16(SP)
   937  	CALL	runtime·memhash(SB)
   938  	MOVQ	24(SP), AX	// memhash's result slot
   939  	MOVQ	AX, ret+16(FP)
   940  	RET
   941  
   942  // hash function using AES hardware instructions
        // Sets up the register arguments of aeshashbody (AX = data, CX = length,
        // DX = address of the result slot) and tail-jumps to it.
   943  TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   944  	MOVQ	p+0(FP), AX	// ptr to data
   945  	MOVQ	s+16(FP), CX	// size
   946  	LEAQ	ret+24(FP), DX	// where aeshashbody stores the hash
   947  	JMP	runtime·aeshashbody(SB)
   948  
   949  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
        	// String variant of aeshash: p points at a string header, from which
        	// the data pointer and length are loaded before tail-jumping to
        	// aeshashbody with DX = address of the result slot.
   950  	MOVQ	p+0(FP), AX	// ptr to string struct
   951  	MOVQ	8(AX), CX	// length of string
   952  	MOVQ	(AX), AX	// string data
   953  	LEAQ	ret+16(FP), DX	// where aeshashbody stores the hash
   954  	JMP	runtime·aeshashbody(SB)
   955  
   956  // AX: data
   957  // CX: length
   958  // DX: address to put return value
        // The seed is not passed in a register: it is read from h+8(FP).
        // aeshash and aeshashstr enter with JMP and this frame is $0-0, so
        // 8(FP) is still the h argument of whichever wrapper was called.
        // Every path ends by storing the low 64 bits of the state to (DX).
   959  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   960  	MOVQ	h+8(FP), X6	// seed to low 64 bits of xmm6
   961  	PINSRQ	$1, CX, X6	// size to high 64 bits of xmm6
   962  	PSHUFHW	$0, X6, X6	// replace size with its low 2 bytes repeated 4 times
   963  	MOVO	runtime·aeskeysched(SB), X7	// X7 = first 16 bytes of aeskeysched, reused as round key below
   964  	CMPQ	CX, $16
   965  	JB	aes0to15
   966  	JE	aes16	// still testing the flags of the compare against 16
   967  	CMPQ	CX, $32
   968  	JBE	aes17to32
   969  	CMPQ	CX, $64
   970  	JBE	aes33to64
   971  	CMPQ	CX, $128
   972  	JBE	aes65to128
   973  	JMP	aes129plus
   974  
   975  aes0to15:
   976  	TESTQ	CX, CX
   977  	JE	aes0
   978  
   979  	ADDQ	$16, AX
   980  	TESTW	$0xff0, AX	// bits 4-11 of data+16 all zero <=> data starts in the last 16 bytes of a 4096-byte block
   981  	JE	endofpage
   982  
   983  	// 16 bytes loaded at this address won't cross
   984  	// a page boundary, so we can load it directly.
   985  	MOVOU	-16(AX), X0	// AX was advanced by 16 above, so this is the original data pointer
   986  	ADDQ	CX, CX	// CX = 2*length; scaled by 8 below it selects a 16-byte table entry
   987  	MOVQ	$masks<>(SB), AX
   988  	PAND	(AX)(CX*8), X0	// clear the bytes loaded from beyond the end of the data
   989  
   990  	// scramble 3 times
   991  	AESENC	X6, X0
   992  	AESENC	X7, X0
   993  	AESENC	X7, X0
   994  	MOVQ	X0, (DX)
   995  	RET
   996  
   997  endofpage:
   998  	// low 12 address bits are 1111 1111 xxxx.  Might be up against
   999  	// a page boundary, so load ending at last byte.
  1000  	// Then shift bytes down using pshufb.
  1001  	MOVOU	-32(AX)(CX*1), X0	// AX is data+16, so these 16 bytes end exactly at data+length
  1002  	ADDQ	CX, CX	// CX = 2*length, same table indexing as above
  1003  	MOVQ	$shifts<>(SB), AX
  1004  	PSHUFB	(AX)(CX*8), X0
  1005  	AESENC	X6, X0
  1006  	AESENC	X7, X0
  1007  	AESENC	X7, X0
  1008  	MOVQ	X0, (DX)
  1009  	RET
  1010  
  1011  aes0:
  1012  	// return input seed
  1013  	MOVQ	h+8(FP), AX
  1014  	MOVQ	AX, (DX)
  1015  	RET
  1016  
  1017  aes16:
  1018  	MOVOU	(AX), X0
  1019  	AESENC	X6, X0
  1020  	AESENC	X7, X0
  1021  	AESENC	X7, X0
  1022  	MOVQ	X0, (DX)
  1023  	RET
  1024  
  1025  aes17to32:
  1026  	// load data to be hashed
  1027  	MOVOU	(AX), X0
  1028  	MOVOU	-16(AX)(CX*1), X1	// last 16 bytes; overlaps X0 when length < 32
  1029  
  1030  	// scramble 3 times
  1031  	AESENC	X6, X0
  1032  	AESENC	runtime·aeskeysched+16(SB), X1
  1033  	AESENC	X7, X0
  1034  	AESENC	X7, X1
  1035  	AESENC	X7, X0
  1036  	AESENC	X7, X1
  1037  
  1038  	// combine results
  1039  	PXOR	X1, X0
  1040  	MOVQ	X0, (DX)
  1041  	RET
  1042  
  1043  aes33to64:
  1044  	MOVOU	(AX), X0
  1045  	MOVOU	16(AX), X1
  1046  	MOVOU	-32(AX)(CX*1), X2	// X2, X3 = last 32 bytes; may overlap X0, X1
  1047  	MOVOU	-16(AX)(CX*1), X3
  1048  	
  1049  	AESENC	X6, X0
  1050  	AESENC	runtime·aeskeysched+16(SB), X1
  1051  	AESENC	runtime·aeskeysched+32(SB), X2
  1052  	AESENC	runtime·aeskeysched+48(SB), X3
  1053  	AESENC	X7, X0
  1054  	AESENC	X7, X1
  1055  	AESENC	X7, X2
  1056  	AESENC	X7, X3
  1057  	AESENC	X7, X0
  1058  	AESENC	X7, X1
  1059  	AESENC	X7, X2
  1060  	AESENC	X7, X3
  1061  
  1062  	PXOR	X2, X0
  1063  	PXOR	X3, X1
  1064  	PXOR	X1, X0
  1065  	MOVQ	X0, (DX)
  1066  	RET
  1067  
  1068  aes65to128:
  1069  	MOVOU	(AX), X0
  1070  	MOVOU	16(AX), X1
  1071  	MOVOU	32(AX), X2
  1072  	MOVOU	48(AX), X3
  1073  	MOVOU	-64(AX)(CX*1), X4	// X4, X5, X8, X9 = last 64 bytes; may overlap the first 64
  1074  	MOVOU	-48(AX)(CX*1), X5
  1075  	MOVOU	-32(AX)(CX*1), X8
  1076  	MOVOU	-16(AX)(CX*1), X9
  1077  	
  1078  	AESENC	X6, X0
  1079  	AESENC	runtime·aeskeysched+16(SB), X1
  1080  	AESENC	runtime·aeskeysched+32(SB), X2
  1081  	AESENC	runtime·aeskeysched+48(SB), X3
  1082  	AESENC	runtime·aeskeysched+64(SB), X4
  1083  	AESENC	runtime·aeskeysched+80(SB), X5
  1084  	AESENC	runtime·aeskeysched+96(SB), X8
  1085  	AESENC	runtime·aeskeysched+112(SB), X9
  1086  	AESENC	X7, X0
  1087  	AESENC	X7, X1
  1088  	AESENC	X7, X2
  1089  	AESENC	X7, X3
  1090  	AESENC	X7, X4
  1091  	AESENC	X7, X5
  1092  	AESENC	X7, X8
  1093  	AESENC	X7, X9
  1094  	AESENC	X7, X0
  1095  	AESENC	X7, X1
  1096  	AESENC	X7, X2
  1097  	AESENC	X7, X3
  1098  	AESENC	X7, X4
  1099  	AESENC	X7, X5
  1100  	AESENC	X7, X8
  1101  	AESENC	X7, X9
  1102  
  1103  	PXOR	X4, X0
  1104  	PXOR	X5, X1
  1105  	PXOR	X8, X2
  1106  	PXOR	X9, X3
  1107  	PXOR	X2, X0
  1108  	PXOR	X3, X1
  1109  	PXOR	X1, X0
  1110  	MOVQ	X0, (DX)
  1111  	RET
  1112  
  1113  aes129plus:
  1114  	// start with last (possibly overlapping) block
  1115  	MOVOU	-128(AX)(CX*1), X0
  1116  	MOVOU	-112(AX)(CX*1), X1
  1117  	MOVOU	-96(AX)(CX*1), X2
  1118  	MOVOU	-80(AX)(CX*1), X3
  1119  	MOVOU	-64(AX)(CX*1), X4
  1120  	MOVOU	-48(AX)(CX*1), X5
  1121  	MOVOU	-32(AX)(CX*1), X8
  1122  	MOVOU	-16(AX)(CX*1), X9
  1123  
  1124  	// scramble state once
  1125  	AESENC	X6, X0
  1126  	AESENC	runtime·aeskeysched+16(SB), X1
  1127  	AESENC	runtime·aeskeysched+32(SB), X2
  1128  	AESENC	runtime·aeskeysched+48(SB), X3
  1129  	AESENC	runtime·aeskeysched+64(SB), X4
  1130  	AESENC	runtime·aeskeysched+80(SB), X5
  1131  	AESENC	runtime·aeskeysched+96(SB), X8
  1132  	AESENC	runtime·aeskeysched+112(SB), X9
  1133  
  1134  	// compute number of remaining 128-byte blocks
  1135  	DECQ	CX
  1136  	SHRQ	$7, CX	// CX = (length-1)/128, at least 1 because length >= 129 here
  1137  	
  1138  aesloop:
  1139  	// scramble state, xor in a block
  1140  	MOVOU	(AX), X10
  1141  	MOVOU	16(AX), X11
  1142  	MOVOU	32(AX), X12
  1143  	MOVOU	48(AX), X13
  1144  	AESENC	X10, X0	// the data block itself is the round-key operand
  1145  	AESENC	X11, X1
  1146  	AESENC	X12, X2
  1147  	AESENC	X13, X3
  1148  	MOVOU	64(AX), X10
  1149  	MOVOU	80(AX), X11
  1150  	MOVOU	96(AX), X12
  1151  	MOVOU	112(AX), X13
  1152  	AESENC	X10, X4
  1153  	AESENC	X11, X5
  1154  	AESENC	X12, X8
  1155  	AESENC	X13, X9
  1156  
  1157  	// scramble state
  1158  	AESENC	X7, X0
  1159  	AESENC	X7, X1
  1160  	AESENC	X7, X2
  1161  	AESENC	X7, X3
  1162  	AESENC	X7, X4
  1163  	AESENC	X7, X5
  1164  	AESENC	X7, X8
  1165  	AESENC	X7, X9
  1166  
  1167  	ADDQ	$128, AX
  1168  	DECQ	CX
  1169  	JNE	aesloop	// loop until the block counter reaches zero
  1170  
  1171  	// 2 more scrambles to finish
  1172  	AESENC	X7, X0
  1173  	AESENC	X7, X1
  1174  	AESENC	X7, X2
  1175  	AESENC	X7, X3
  1176  	AESENC	X7, X4
  1177  	AESENC	X7, X5
  1178  	AESENC	X7, X8
  1179  	AESENC	X7, X9
  1180  	AESENC	X7, X0
  1181  	AESENC	X7, X1
  1182  	AESENC	X7, X2
  1183  	AESENC	X7, X3
  1184  	AESENC	X7, X4
  1185  	AESENC	X7, X5
  1186  	AESENC	X7, X8
  1187  	AESENC	X7, X9
  1188  
  1189  	PXOR	X4, X0	// fold the eight lanes down into X0
  1190  	PXOR	X5, X1
  1191  	PXOR	X8, X2
  1192  	PXOR	X9, X3
  1193  	PXOR	X2, X0
  1194  	PXOR	X3, X1
  1195  	PXOR	X1, X0
  1196  	MOVQ	X0, (DX)
  1197  	RET
  1198  	
  1199  TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
        	// Hashes the 4 bytes at p with seed h: three AES rounds keyed from
        	// aeskeysched, then the low 64 bits of the state are returned.
  1200  	MOVQ	p+0(FP), AX	// ptr to data
  1201  	MOVQ	h+8(FP), X0	// seed
  1202  	PINSRD	$2, (AX), X0	// data
  1203  	AESENC	runtime·aeskeysched+0(SB), X0
  1204  	AESENC	runtime·aeskeysched+16(SB), X0
  1205  	AESENC	runtime·aeskeysched+32(SB), X0
  1206  	MOVQ	X0, ret+16(FP)
  1207  	RET
  1208  
  1209  TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
        	// Hashes the 8 bytes at p with seed h: seed in the low quadword, data in
        	// the high quadword, three AES rounds keyed from aeskeysched, and the
        	// low 64 bits of the state are returned.
  1210  	MOVQ	p+0(FP), AX	// ptr to data
  1211  	MOVQ	h+8(FP), X0	// seed
  1212  	PINSRQ	$1, (AX), X0	// data
  1213  	AESENC	runtime·aeskeysched+0(SB), X0
  1214  	AESENC	runtime·aeskeysched+16(SB), X0
  1215  	AESENC	runtime·aeskeysched+32(SB), X0
  1216  	MOVQ	X0, ret+16(FP)
  1217  	RET
  1218  
  1219  // Byte masks that clear the high part of a 16-byte register.
        // The table holds 16 entries of 16 bytes each; the entry at offset 16*i
        // has its low i bytes set to 0xff and the remaining bytes zero (i = 0..15).
        // aeshashbody indexes it with the data length and applies it with PAND.
        // NOTE(review): PAND with a memory operand needs a 16-byte aligned
        // address; confirm the linker aligns this symbol accordingly.
  1220  DATA masks<>+0x00(SB)/8, $0x0000000000000000
  1221  DATA masks<>+0x08(SB)/8, $0x0000000000000000
  1222  DATA masks<>+0x10(SB)/8, $0x00000000000000ff
  1223  DATA masks<>+0x18(SB)/8, $0x0000000000000000
  1224  DATA masks<>+0x20(SB)/8, $0x000000000000ffff
  1225  DATA masks<>+0x28(SB)/8, $0x0000000000000000
  1226  DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
  1227  DATA masks<>+0x38(SB)/8, $0x0000000000000000
  1228  DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
  1229  DATA masks<>+0x48(SB)/8, $0x0000000000000000
  1230  DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
  1231  DATA masks<>+0x58(SB)/8, $0x0000000000000000
  1232  DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
  1233  DATA masks<>+0x68(SB)/8, $0x0000000000000000
  1234  DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
  1235  DATA masks<>+0x78(SB)/8, $0x0000000000000000
  1236  DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
  1237  DATA masks<>+0x88(SB)/8, $0x0000000000000000
  1238  DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
  1239  DATA masks<>+0x98(SB)/8, $0x00000000000000ff
  1240  DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
  1241  DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
  1242  DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
  1243  DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
  1244  DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
  1245  DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
  1246  DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
  1247  DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
  1248  DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
  1249  DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
  1250  DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
  1251  DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
  1252  GLOBL masks<>(SB),RODATA,$256
  1253  
  1254  // These are control vectors for PSHUFB.  They move data down from
  1255  // the high bytes of the register to the low bytes of the register.
  1256  // The index is how many bytes to move.
        // The table holds 16 entries of 16 bytes each; the entry at offset 16*i
        // selects source bytes 16-i..15 into destination bytes 0..i-1, and uses
        // 0xff selectors (bit 7 set, which makes PSHUFB write zero) elsewhere.
        // NOTE(review): PSHUFB with a memory operand needs a 16-byte aligned
        // address; confirm the linker aligns this symbol accordingly.
  1257  DATA shifts<>+0x00(SB)/8, $0x0000000000000000
  1258  DATA shifts<>+0x08(SB)/8, $0x0000000000000000
  1259  DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
  1260  DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
  1261  DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
  1262  DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
  1263  DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
  1264  DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
  1265  DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
  1266  DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
  1267  DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
  1268  DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
  1269  DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
  1270  DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
  1271  DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
  1272  DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
  1273  DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
  1274  DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
  1275  DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
  1276  DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
  1277  DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
  1278  DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
  1279  DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
  1280  DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
  1281  DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
  1282  DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
  1283  DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
  1284  DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
  1285  DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
  1286  DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
  1287  DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
  1288  DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
  1289  GLOBL shifts<>(SB),RODATA,$256
  1290  
  1291  TEXT runtime·memeq(SB),NOSPLIT,$0-25
  1292  	LEAQ	ret+24(FP), AX	// memeqbody writes the boolean result through AX
  1293  	MOVQ	size+16(FP), BX	// byte count
  1294  	MOVQ	b+8(FP), DI	// second operand
  1295  	MOVQ	a+0(FP), SI	// first operand
  1296  	JMP	runtime·memeqbody(SB)
  1297  
  1298  // memequal_varlen(a, b unsafe.Pointer) bool
        // Compares two blocks whose size is read from the closure addressed by DX.
        // Identical pointers are reported equal without reading the size or the data.
  1299  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
  1300  	MOVQ	a+0(FP), SI
  1301  	MOVQ	b+8(FP), DI
  1302  	CMPQ	SI, DI
  1303  	JEQ	eq	// same address: equal by definition
  1304  	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
  1305  	LEAQ	ret+16(FP), AX	// memeqbody writes the result byte through AX
  1306  	JMP	runtime·memeqbody(SB)
  1307  eq:
  1308  	MOVB	$1, ret+16(FP)
  1309  	RET
  1310  
  1311  // eqstring tests whether two strings are equal.
  1312  // The compiler guarantees that strings passed
  1313  // to eqstring have equal length.
  1314  // See runtime_test.go:eqstring_generic for
  1315  // equivalent Go code.
        // Because of that guarantee only s1's length is read; s2's length is ignored.
  1316  TEXT runtime·eqstring(SB),NOSPLIT,$0-33
  1317  	MOVQ	s1str+0(FP), SI
  1318  	MOVQ	s2str+16(FP), DI
  1319  	CMPQ	SI, DI
  1320  	JEQ	eq	// same data pointer (and same length): equal without comparing bytes
  1321  	MOVQ	s1len+8(FP), BX
  1322  	LEAQ	v+32(FP), AX	// memeqbody writes the result byte through AX
  1323  	JMP	runtime·memeqbody(SB)
  1324  eq:
  1325  	MOVB	$1, v+32(FP)
  1326  	RET
  1327  
  1328  // a in SI
  1329  // b in DI
  1330  // count in BX
  1331  // address of result byte in AX
        // Stores 1 to (AX) when the BX bytes at SI and DI are equal, 0 otherwise.
        // Entered by JMP from the wrappers, so RET returns to the wrapper's caller.
  1332  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1333  	CMPQ	BX, $8
  1334  	JB	small
  1335  	
  1336  	// 64 bytes at a time using xmm registers
  1337  hugeloop:
  1338  	CMPQ	BX, $64
  1339  	JB	bigloop
  1340  	MOVOU	(SI), X0
  1341  	MOVOU	(DI), X1
  1342  	MOVOU	16(SI), X2
  1343  	MOVOU	16(DI), X3
  1344  	MOVOU	32(SI), X4
  1345  	MOVOU	32(DI), X5
  1346  	MOVOU	48(SI), X6
  1347  	MOVOU	48(DI), X7
  1348  	PCMPEQB	X1, X0
  1349  	PCMPEQB	X3, X2
  1350  	PCMPEQB	X5, X4
  1351  	PCMPEQB	X7, X6
  1352  	PAND	X2, X0	// AND the four per-byte equality masks together
  1353  	PAND	X6, X4
  1354  	PAND	X4, X0
  1355  	PMOVMSKB X0, DX
  1356  	ADDQ	$64, SI
  1357  	ADDQ	$64, DI
  1358  	SUBQ	$64, BX
  1359  	CMPL	DX, $0xffff	// all 16 mask bits set <=> all 64 bytes matched
  1360  	JEQ	hugeloop
  1361  	MOVB	$0, (AX)
  1362  	RET
  1363  
  1364  	// 8 bytes at a time using 64-bit register
  1365  bigloop:
  1366  	CMPQ	BX, $8
  1367  	JBE	leftover
  1368  	MOVQ	(SI), CX
  1369  	MOVQ	(DI), DX
  1370  	ADDQ	$8, SI
  1371  	ADDQ	$8, DI
  1372  	SUBQ	$8, BX
  1373  	CMPQ	CX, DX
  1374  	JEQ	bigloop
  1375  	MOVB	$0, (AX)
  1376  	RET
  1377  
  1378  	// remaining 0-8 bytes
        	// Compare the 8 bytes that end at the end of each buffer.  This path is
        	// only reached when the original count was >= 8, so the load stays inside
        	// the buffers and at worst re-checks bytes already found equal.
  1379  leftover:
  1380  	MOVQ	-8(SI)(BX*1), CX
  1381  	MOVQ	-8(DI)(BX*1), DX
  1382  	CMPQ	CX, DX
  1383  	SETEQ	(AX)
  1384  	RET
  1385  
  1386  small:
  1387  	CMPQ	BX, $0
  1388  	JEQ	equal	// zero length: ZF is set, so SETEQ at equal stores 1
  1389  
  1390  	LEAQ	0(BX*8), CX
  1391  	NEGQ	CX	// shifts use CX mod 64, i.e. 64 - 8*count bits
  1392  
  1393  	CMPB	SI, $0xf8	// low address byte above 0xf8: an 8-byte load would run past a 256-byte boundary
  1394  	JA	si_high
  1395  
  1396  	// load at SI won't cross a page boundary.
  1397  	MOVQ	(SI), SI
  1398  	JMP	si_finish
  1399  si_high:
  1400  	// address ends in 11111xxx.  Load up to bytes we want, move to correct position.
  1401  	MOVQ	-8(SI)(BX*1), SI
  1402  	SHRQ	CX, SI
  1403  si_finish:
  1404  
  1405  	// same for DI.
  1406  	CMPB	DI, $0xf8
  1407  	JA	di_high
  1408  	MOVQ	(DI), DI
  1409  	JMP	di_finish
  1410  di_high:
  1411  	MOVQ	-8(DI)(BX*1), DI
  1412  	SHRQ	CX, DI
  1413  di_finish:
  1414  
  1415  	SUBQ	SI, DI
  1416  	SHLQ	CX, DI	// drop the bytes beyond count; ZF is set iff the low count bytes matched
  1417  equal:
  1418  	SETEQ	(AX)
  1419  	RET
  1420  
  1421  TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
  1422  	LEAQ	ret+32(FP), R9	// cmpbody stores -1/0/+1 through R9
  1423  	MOVQ	s2_len+24(FP), DX
  1424  	MOVQ	s2_base+16(FP), DI
  1425  	MOVQ	s1_len+8(FP), BX
  1426  	MOVQ	s1_base+0(FP), SI
  1427  	JMP	runtime·cmpbody(SB)
  1428  
  1429  TEXT bytes·Compare(SB),NOSPLIT,$0-56
  1430  	LEAQ	res+48(FP), R9	// cmpbody stores -1/0/+1 through R9
  1431  	MOVQ	s2+32(FP), DX	// length of the second slice
  1432  	MOVQ	s2+24(FP), DI	// data pointer of the second slice
  1433  	MOVQ	s1+8(FP), BX	// length of the first slice
  1434  	MOVQ	s1+0(FP), SI	// data pointer of the first slice
  1435  	JMP	runtime·cmpbody(SB)
  1436  
  1437  // input:
  1438  //   SI = a
  1439  //   DI = b
  1440  //   BX = alen
  1441  //   DX = blen
  1442  //   R9 = address of output word (stores -1/0/1 here)
        // The first min(alen, blen) bytes are compared as unsigned bytes; if they
        // all match, the result is decided by comparing the lengths (see allsame).
  1443  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1444  	CMPQ	SI, DI
  1445  	JEQ	allsame	// same start address: only the lengths can differ
  1446  	CMPQ	BX, DX
  1447  	MOVQ	DX, R8	// MOVQ leaves the flags of the compare intact
  1448  	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
  1449  	CMPQ	R8, $8
  1450  	JB	small
  1451  
  1452  loop:
  1453  	CMPQ	R8, $16
  1454  	JBE	_0through16
  1455  	MOVOU	(SI), X0
  1456  	MOVOU	(DI), X1
  1457  	PCMPEQB X0, X1
  1458  	PMOVMSKB X1, AX
  1459  	XORQ	$0xffff, AX	// convert EQ to NE
  1460  	JNE	diff16	// branch if at least one byte is not equal
  1461  	ADDQ	$16, SI
  1462  	ADDQ	$16, DI
  1463  	SUBQ	$16, R8
  1464  	JMP	loop
  1465  	
  1466  	// AX = bit mask of differences
  1467  diff16:
  1468  	BSFQ	AX, BX	// index of first byte that differs
  1469  	XORQ	AX, AX
  1470  	MOVB	(SI)(BX*1), CX
  1471  	CMPB	CX, (DI)(BX*1)
  1472  	SETHI	AX	// 1 if a's byte is above b's byte (unsigned)
  1473  	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
  1474  	MOVQ	AX, (R9)
  1475  	RET
  1476  
  1477  	// 0 through 16 bytes left, alen>=8, blen>=8
  1478  _0through16:
  1479  	CMPQ	R8, $8
  1480  	JBE	_0through8
  1481  	MOVQ	(SI), AX
  1482  	MOVQ	(DI), CX
  1483  	CMPQ	AX, CX
  1484  	JNE	diff8
  1485  _0through8:
  1486  	MOVQ	-8(SI)(R8*1), AX	// last 8 bytes to compare; may overlap bytes already checked
  1487  	MOVQ	-8(DI)(R8*1), CX
  1488  	CMPQ	AX, CX
  1489  	JEQ	allsame
  1490  
  1491  	// AX and CX contain parts of a and b that differ.
  1492  diff8:
  1493  	BSWAPQ	AX	// reverse order of bytes
  1494  	BSWAPQ	CX
  1495  	XORQ	AX, CX
  1496  	BSRQ	CX, CX	// index of highest bit difference
  1497  	SHRQ	CX, AX	// move a's bit to bottom
  1498  	ANDQ	$1, AX	// mask bit
  1499  	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
  1500  	MOVQ	AX, (R9)
  1501  	RET
  1502  
  1503  	// 0-7 bytes in common
  1504  small:
  1505  	LEAQ	(R8*8), CX	// bytes left -> bits left
  1506  	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
  1507  	JEQ	allsame	// nothing to compare: decide on the lengths
  1508  
  1509  	// load bytes of a into high bytes of SI
  1510  	CMPB	SI, $0xf8
  1511  	JA	si_high
  1512  	MOVQ	(SI), SI
  1513  	JMP	si_finish
  1514  si_high:
  1515  	MOVQ	-8(SI)(R8*1), SI
  1516  	SHRQ	CX, SI
  1517  si_finish:
  1518  	SHLQ	CX, SI
  1519  
  1520  	// load bytes of b into high bytes of DI
  1521  	CMPB	DI, $0xf8
  1522  	JA	di_high
  1523  	MOVQ	(DI), DI
  1524  	JMP	di_finish
  1525  di_high:
  1526  	MOVQ	-8(DI)(R8*1), DI
  1527  	SHRQ	CX, DI
  1528  di_finish:
  1529  	SHLQ	CX, DI
  1530  
  1531  	BSWAPQ	SI	// reverse order of bytes
  1532  	BSWAPQ	DI
  1533  	XORQ	SI, DI	// find bit differences
  1534  	JEQ	allsame
  1535  	BSRQ	DI, CX	// index of highest bit difference
  1536  	SHRQ	CX, SI	// move a's bit to bottom
  1537  	ANDQ	$1, SI	// mask bit
  1538  	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
  1539  	MOVQ	AX, (R9)
  1540  	RET
  1541  
  1542  allsame:
  1543  	XORQ	AX, AX
  1544  	XORQ	CX, CX
  1545  	CMPQ	BX, DX
  1546  	SETGT	AX	// 1 if alen > blen
  1547  	SETEQ	CX	// 1 if alen == blen
  1548  	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
  1549  	MOVQ	AX, (R9)
  1550  	RET
  1551  
  1552  TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
  1553  	LEAQ ret+32(FP), R8	// indexbytebody stores the index through R8
  1554  	MOVB c+24(FP), AL	// byte to look for
  1555  	MOVQ s_len+8(FP), BX	// slice length
  1556  	MOVQ s+0(FP), SI	// slice data pointer
  1557  	JMP  runtime·indexbytebody(SB)
  1558  
  1559  TEXT strings·IndexByte(SB),NOSPLIT,$0-32
  1560  	LEAQ ret+24(FP), R8	// indexbytebody stores the index through R8
  1561  	MOVB c+16(FP), AL	// byte to look for
  1562  	MOVQ s_len+8(FP), BX	// string length
  1563  	MOVQ s+0(FP), SI	// string data pointer
  1564  	JMP  runtime·indexbytebody(SB)
  1565  
  1566  // input:
  1567  //   SI: data
  1568  //   BX: data len
  1569  //   AL: byte sought
  1570  //   R8: address to put result
        // Stores the index of the first byte equal to AL to (R8), or -1 if there
        // is none.  Buffers shorter than 16 bytes are scanned with REPN SCASB.
        // Longer ones are scanned in three parts: a byte-wise head up to the first
        // 16-byte boundary, aligned 16-byte SSE compares, and a byte-wise tail.
  1571  TEXT runtime·indexbytebody(SB),NOSPLIT,$0
  1572  	MOVQ SI, DI	// DI is the scan cursor; SI keeps the start for the index computation
  1573  
  1574  	CMPQ BX, $16
  1575  	JLT small
  1576  
  1577  	// round up to first 16-byte boundary
  1578  	TESTQ $15, SI
  1579  	JZ aligned
  1580  	MOVQ SI, CX
  1581  	ANDQ $~15, CX
  1582  	ADDQ $16, CX
  1583  
  1584  	// search the beginning
  1585  	SUBQ SI, CX	// CX = bytes before the boundary (1..15, always less than BX here)
  1586  	REPN; SCASB
  1587  	JZ success
  1588  
  1589  // DI is 16-byte aligned; get ready to search using SSE instructions
  1590  aligned:
  1591  	// round down to last 16-byte boundary
  1592  	MOVQ BX, R11
  1593  	ADDQ SI, R11
  1594  	ANDQ $~15, R11
  1595  
  1596  	// shuffle X0 around so that each byte contains c
  1597  	MOVD AX, X0
  1598  	PUNPCKLBW X0, X0
  1599  	PUNPCKLBW X0, X0
  1600  	PSHUFL $0, X0, X0
  1601  	JMP condition
  1602  
  1603  sse:
  1604  	// move the next 16-byte chunk of the buffer into X1
  1605  	MOVO (DI), X1
  1606  	// compare bytes in X0 to X1
  1607  	PCMPEQB X0, X1
  1608  	// take the top bit of each byte in X1 and put the result in DX
  1609  	PMOVMSKB X1, DX
  1610  	TESTL DX, DX
  1611  	JNZ ssesuccess
  1612  	ADDQ $16, DI
  1613  
  1614  condition:
  1615  	CMPQ DI, R11
  1616  	JLT sse
  1617  
  1618  	// search the end
  1619  	MOVQ SI, CX
  1620  	ADDQ BX, CX
  1621  	SUBQ R11, CX
  1622  	// if CX == 0, the zero flag will be set and we'll end up
  1623  	// returning a false success
  1624  	JZ failure
  1625  	REPN; SCASB
  1626  	JZ success
  1627  
  1628  failure:
  1629  	MOVQ $-1, (R8)
  1630  	RET
  1631  
  1632  // handle for lengths < 16
  1633  small:
  1634  	MOVQ BX, CX
  1635  	REPN; SCASB
  1636  	JZ success
  1637  	MOVQ $-1, (R8)
  1638  	RET
  1639  
  1640  // we've found the chunk containing the byte
  1641  // now just figure out which specific byte it is
  1642  ssesuccess:
  1643  	// get the index of the least significant set bit
  1644  	BSFW DX, DX
  1645  	SUBQ SI, DI
  1646  	ADDQ DI, DX
  1647  	MOVQ DX, (R8)
  1648  	RET
  1649  
  1650  success:
        	// SCASB leaves DI one past the matching byte, so index = DI - SI - 1.
        	// The decrement must be a 64-bit operation: a 32-bit SUBL would zero
        	// the upper half of DI and truncate indexes of 2^32 or more, which the
        	// tail scan can produce on very large buffers.
  1651  	SUBQ SI, DI
  1652  	SUBQ $1, DI
  1653  	MOVQ DI, (R8)
  1654  	RET
  1655  
  1656  TEXT bytes·Equal(SB),NOSPLIT,$0-49
        	// Reports whether slices a and b have the same length and contents.
        	// Differing lengths return false without touching the data; otherwise
        	// the byte comparison is delegated to memeqbody.
  1657  	MOVQ	a_len+8(FP), BX
  1658  	MOVQ	b_len+32(FP), CX
  1659  	CMPQ	BX, CX
  1660  	JNE	eqret
  1661  	MOVQ	a+0(FP), SI
  1662  	MOVQ	b+24(FP), DI
  1663  	LEAQ	ret+48(FP), AX	// memeqbody writes the result byte through AX
  1664  	JMP	runtime·memeqbody(SB)
  1665  eqret:
  1666  	MOVB	$0, ret+48(FP)
  1667  	RET
  1668  
  1669  TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
        	// Advances the 32-bit state kept in the current m (m_fastrand) by one
        	// step and returns the new value:
        	//   x += x; if bit 31 of the doubled value is set, x ^= 0x88888eef
  1670  	get_tls(CX)
  1671  	MOVQ	g(CX), AX
  1672  	MOVQ	g_m(AX), AX	// AX = current g's m
  1673  	MOVL	m_fastrand(AX), DX
  1674  	ADDL	DX, DX
  1675  	MOVL	DX, BX	// keep the doubled value without the xor
  1676  	XORL	$0x88888eef, DX	// the constant has bit 31 set, so SF ends up as the inverse of the doubled value's bit 31
  1677  	CMOVLMI	BX, DX	// SF set => bit 31 was clear => discard the xor
  1678  	MOVL	DX, m_fastrand(AX)
  1679  	MOVL	DX, ret+0(FP)
  1680  	RET
  1681  
  1682  TEXT runtime·return0(SB), NOSPLIT, $0
  1683  	MOVL	$0, AX	// leave 0 in AX for the caller; nothing is written to the stack
  1684  	RET
  1685  
  1686  
  1687  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  1688  // Must obey the gcc calling convention.
        // The value is returned in AX; only AX and CX are modified.
  1689  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  1690  	get_tls(CX)
  1691  	MOVQ	g(CX), AX	// AX = g
  1692  	MOVQ	g_m(AX), AX	// AX = g->m
  1693  	MOVQ	m_curg(AX), AX	// AX = g->m->curg
  1694  	MOVQ	(g_stack+stack_hi)(AX), AX	// AX = curg.stack.hi
  1695  	RET
  1696  
  1697  // The top-most function running on a goroutine
  1698  // returns to goexit+PCQuantum.
        // The leading NOP gives that return address something to land on before
        // the CALL; the function has no RET because goexit1 does not return.
  1699  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  1700  	BYTE	$0x90	// NOP
  1701  	CALL	runtime·goexit1(SB)	// does not return
  1702  	// traceback from goexit1 must hit code range of goexit
  1703  	BYTE	$0x90	// NOP
  1704  
  1705  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
        	// prefetcht0(addr): issues a PREFETCHT0 hint for the given address; returns nothing.
  1706  	MOVQ	addr+0(FP), AX
  1707  	PREFETCHT0	(AX)
  1708  	RET
  1709  
  1710  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
        	// prefetcht1(addr): issues a PREFETCHT1 hint for the given address; returns nothing.
  1711  	MOVQ	addr+0(FP), AX
  1712  	PREFETCHT1	(AX)
  1713  	RET
  1714  
  1715  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
        	// prefetcht2(addr): issues a PREFETCHT2 hint for the given address; returns nothing.
  1716  	MOVQ	addr+0(FP), AX
  1717  	PREFETCHT2	(AX)
  1718  	RET
  1719  
  1720  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
        	// prefetchnta(addr): issues a PREFETCHNTA hint for the given address; returns nothing.
  1721  	MOVQ	addr+0(FP), AX
  1722  	PREFETCHNTA	(AX)
  1723  	RET
  1724  
  1725  // This is called from .init_array and follows the platform, not Go, ABI.
        // The value in DI is appended to the list ending at runtime·lastmoduledatap:
        // it becomes the old tail's next pointer and then the new tail.
        // NOTE(review): DI is presumably the new moduledata pointer passed as the
        // first platform-ABI argument; confirm against the code that emits the call.
  1726  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  1727  	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
  1728  	MOVQ	runtime·lastmoduledatap(SB), AX
  1729  	MOVQ	DI, moduledata_next(AX)	// old tail's next = DI
  1730  	MOVQ	DI, runtime·lastmoduledatap(SB)	// DI is the new tail
  1731  	POPQ	R15
  1732  	RET