github.com/peggyl/go@v0.0.0-20151008231540-ae315999c2d5/src/runtime/asm_amd64.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
        	// Bootstrap sequence for the initial thread. argc arrives in DI and
        	// argv in SI. The code sets g0's stack bounds from the current SP,
        	// records CPUID information, calls _cgo_init when it is non-nil,
        	// sets up and sanity-checks TLS, links g0 and m0 together, runs
        	// check/args/osinit/schedinit, queues runtime·main via newproc and
        	// finally calls mstart. Falling out of mstart crashes deliberately.
    11  	// copy arguments forward on an even stack
    12  	MOVQ	DI, AX		// argc
    13  	MOVQ	SI, BX		// argv
    14  	SUBQ	$(4*8+7), SP		// 2args 2auto
    15  	ANDQ	$~15, SP	// round SP down to a 16-byte boundary
    16  	MOVQ	AX, 16(SP)	// stash argc
    17  	MOVQ	BX, 24(SP)	// stash argv
    18  	
    19  	// create istack out of the given (operating system) stack.
    20  	// _cgo_init may update stackguard.
    21  	MOVQ	$runtime·g0(SB), DI
    22  	LEAQ	(-64*1024+104)(SP), BX	// BX = SP - 64KB + 104, used as guard and low bound
    23  	MOVQ	BX, g_stackguard0(DI)
    24  	MOVQ	BX, g_stackguard1(DI)
    25  	MOVQ	BX, (g_stack+stack_lo)(DI)
    26  	MOVQ	SP, (g_stack+stack_hi)(DI)
    27  
    28  	// find out information about the processor we're on
    29  	MOVQ	$0, AX		// CPUID leaf 0
    30  	CPUID
    31  	CMPQ	AX, $0		// AX == 0 here: skip the vendor check and the leaf 1 query
    32  	JE	nocpuinfo
    33  
    34  	// Figure out how to serialize RDTSC.
    35  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    36  	// Don't know about the rest, so let's do MFENCE.
    37  	CMPL	BX, $0x756E6547  // "Genu"
    38  	JNE	notintel
    39  	CMPL	DX, $0x49656E69  // "ineI"
    40  	JNE	notintel
    41  	CMPL	CX, $0x6C65746E  // "ntel"
    42  	JNE	notintel
    43  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)	// read by cputicks to choose LFENCE over MFENCE
    44  notintel:
    45  
    46  	MOVQ	$1, AX		// CPUID leaf 1
    47  	CPUID
    48  	MOVL	CX, runtime·cpuid_ecx(SB)
    49  	MOVL	DX, runtime·cpuid_edx(SB)
    50  nocpuinfo:	
    51  	
    52  	// if there is an _cgo_init, call it.
    53  	MOVQ	_cgo_init(SB), AX
    54  	TESTQ	AX, AX
    55  	JZ	needtls
    56  	// g0 already in DI
    57  	MOVQ	DI, CX	// Win64 uses CX for first parameter
    58  	MOVQ	$setg_gcc<>(SB), SI	// second parameter: address of setg_gcc
    59  	CALL	AX
    60  
    61  	// update stackguard after _cgo_init
    62  	MOVQ	$runtime·g0(SB), CX
    63  	MOVQ	(g_stack+stack_lo)(CX), AX
    64  	ADDQ	$const__StackGuard, AX
    65  	MOVQ	AX, g_stackguard0(CX)
    66  	MOVQ	AX, g_stackguard1(CX)
    67  
    68  	CMPL	runtime·iswindows(SB), $0
    69  	JEQ ok	// with cgo, only the iswindows != 0 case falls through to the TLS setup below
    70  needtls:
    71  	// skip TLS setup on Plan 9
    72  	CMPL	runtime·isplan9(SB), $1
    73  	JEQ ok
    74  	// skip TLS setup on Solaris
    75  	CMPL	runtime·issolaris(SB), $1
    76  	JEQ ok
    77  
    78  	LEAQ	runtime·tls0(SB), DI
    79  	CALL	runtime·settls(SB)
    80  
    81  	// store through it, to make sure it works
    82  	get_tls(BX)
    83  	MOVQ	$0x123, g(BX)
    84  	MOVQ	runtime·tls0(SB), AX	// read the marker back directly from tls0
    85  	CMPQ	AX, $0x123
    86  	JEQ 2(PC)	// marker matched: skip the abort
    87  	MOVL	AX, 0	// abort
    88  ok:
    89  	// set the per-goroutine and per-mach "registers"
    90  	get_tls(BX)
    91  	LEAQ	runtime·g0(SB), CX
    92  	MOVQ	CX, g(BX)
    93  	LEAQ	runtime·m0(SB), AX
    94  
    95  	// save m->g0 = g0
    96  	MOVQ	CX, m_g0(AX)
    97  	// save m0 to g0->m
    98  	MOVQ	AX, g_m(CX)
    99  
   100  	CLD				// convention is D is always left cleared
   101  	CALL	runtime·check(SB)
   102  
   103  	MOVL	16(SP), AX		// copy argc
   104  	MOVL	AX, 0(SP)
   105  	MOVQ	24(SP), AX		// copy argv
   106  	MOVQ	AX, 8(SP)
   107  	CALL	runtime·args(SB)
   108  	CALL	runtime·osinit(SB)
   109  	CALL	runtime·schedinit(SB)
   110  
   111  	// create a new goroutine to start program
   112  	MOVQ	$runtime·mainPC(SB), AX		// entry
   113  	PUSHQ	AX
   114  	PUSHQ	$0			// arg size
   115  	CALL	runtime·newproc(SB)
   116  	POPQ	AX	// discard the two words pushed for newproc
   117  	POPQ	AX
   118  
   119  	// start this M
   120  	CALL	runtime·mstart(SB)
   121  
   122  	MOVL	$0xf1, 0xf1  // crash
   123  	RET
   124  
   125  DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
        // mainPC is a read-only 8-byte word holding the address of runtime·main;
        // rt0_go passes the address of this word to newproc as the entry value.
   126  GLOBL	runtime·mainPC(SB),RODATA,$8
   127  
   128  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
        	// Emits the single byte 0xcc (the x86 INT3 breakpoint opcode) and returns.
   129  	BYTE	$0xcc
   130  	RET
   131  
   132  TEXT runtime·asminit(SB),NOSPLIT,$0-0
        	// Intentionally empty on this architecture: returns immediately.
   133  	// No per-thread init.
   134  	RET
   135  
   136  /*
   137   *  go-routine
   138   */
   139  
   140  // void gosave(Gobuf*)
   141  // save state in Gobuf; setjmp
        // Records the caller's SP, PC and BP plus the current g in the Gobuf,
        // and zeroes its ret and ctxt fields.
   142  TEXT runtime·gosave(SB), NOSPLIT, $0-8
   143  	MOVQ	buf+0(FP), AX		// gobuf
   144  	LEAQ	buf+0(FP), BX		// caller's SP
   145  	MOVQ	BX, gobuf_sp(AX)
   146  	MOVQ	0(SP), BX		// caller's PC
   147  	MOVQ	BX, gobuf_pc(AX)
   148  	MOVQ	$0, gobuf_ret(AX)
   149  	MOVQ	$0, gobuf_ctxt(AX)
   150  	MOVQ	BP, gobuf_bp(AX)
   151  	get_tls(CX)
   152  	MOVQ	g(CX), BX		// current g
   153  	MOVQ	BX, gobuf_g(AX)
   154  	RET
   155  
   156  // void gogo(Gobuf*)
   157  // restore state from Gobuf; longjmp
        // Makes gobuf.g the current g, reloads SP, AX (ret), DX (ctxt) and BP
        // from the Gobuf, clears those saved fields, and jumps to gobuf.pc.
        // Does not return to its caller.
   158  TEXT runtime·gogo(SB), NOSPLIT, $0-8
   159  	MOVQ	buf+0(FP), BX		// gobuf
   160  	MOVQ	gobuf_g(BX), DX
   161  	MOVQ	0(DX), CX		// make sure g != nil
   162  	get_tls(CX)
   163  	MOVQ	DX, g(CX)
   164  	MOVQ	gobuf_sp(BX), SP	// restore SP
   165  	MOVQ	gobuf_ret(BX), AX
   166  	MOVQ	gobuf_ctxt(BX), DX
   167  	MOVQ	gobuf_bp(BX), BP
   168  	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   169  	MOVQ	$0, gobuf_ret(BX)
   170  	MOVQ	$0, gobuf_ctxt(BX)
   171  	MOVQ	$0, gobuf_bp(BX)
   172  	MOVQ	gobuf_pc(BX), BX	// BX is reused: the gobuf pointer is no longer needed
   173  	JMP	BX
   174  
   175  // func mcall(fn func(*g))
   176  // Switch to m->g0's stack, call fn(g).
   177  // Fn must never return.  It should gogo(&g->sched)
   178  // to keep running g.
        // Calling mcall while already on m->g0 jumps to badmcall; if fn does
        // return, control jumps to badmcall2.
   179  TEXT runtime·mcall(SB), NOSPLIT, $0-8
   180  	MOVQ	fn+0(FP), DI
   181  	
   182  	get_tls(CX)
   183  	MOVQ	g(CX), AX	// save state in g->sched
   184  	MOVQ	0(SP), BX	// caller's PC
   185  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   186  	LEAQ	fn+0(FP), BX	// caller's SP
   187  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   188  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   189  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   190  
   191  	// switch to m->g0 & its stack, call fn
   192  	MOVQ	g(CX), BX
   193  	MOVQ	g_m(BX), BX
   194  	MOVQ	m_g0(BX), SI
   195  	CMPQ	SI, AX	// if g == m->g0 call badmcall
   196  	JNE	3(PC)	// different g: skip the two-instruction badmcall jump
   197  	MOVQ	$runtime·badmcall(SB), AX
   198  	JMP	AX
   199  	MOVQ	SI, g(CX)	// g = m->g0
   200  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   201  	PUSHQ	AX	// argument for fn: the g we just switched away from
   202  	MOVQ	DI, DX	// DX = the func value itself (closure context)
   203  	MOVQ	0(DI), DI	// DI = code pointer stored in the func value
   204  	CALL	DI
   205  	POPQ	AX
   206  	MOVQ	$runtime·badmcall2(SB), AX	// reached only if fn returned
   207  	JMP	AX
   208  	RET
   209  
   210  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   211  // of the G stack.  We need to distinguish the routine that
   212  // lives at the bottom of the G stack from the one that lives
   213  // at the top of the system stack because the one at the top of
   214  // the system stack terminates the stack walk (see topofstack()).
        // Its address is stored as the saved PC by systemstack; the body is a bare RET.
   215  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   216  	RET
   217  
   218  // func systemstack(fn func())
        // Runs fn on the m->g0 stack. When the current g is already gsignal or g0,
        // fn is called directly on the current stack. When the current g is
        // m->curg, its state is saved in g->sched, execution switches to g0,
        // fn is called, and the code then switches back to m->curg. Any other g
        // is reported through badsystemstack.
   219  TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   220  	MOVQ	fn+0(FP), DI	// DI = fn
   221  	get_tls(CX)
   222  	MOVQ	g(CX), AX	// AX = g
   223  	MOVQ	g_m(AX), BX	// BX = m
   224  
   225  	MOVQ	m_gsignal(BX), DX	// DX = gsignal
   226  	CMPQ	AX, DX
   227  	JEQ	noswitch
   228  
   229  	MOVQ	m_g0(BX), DX	// DX = g0
   230  	CMPQ	AX, DX
   231  	JEQ	noswitch
   232  
   233  	MOVQ	m_curg(BX), R8
   234  	CMPQ	AX, R8
   235  	JEQ	switch
   236  	
   237  	// Bad: g is not gsignal, not g0, not curg. What is it?
        	// NOTE(review): badsystemstack presumably does not return; if it did,
        	// execution would fall through into the switch path below.
   238  	MOVQ	$runtime·badsystemstack(SB), AX
   239  	CALL	AX
   240  
   241  switch:
   242  	// save our state in g->sched.  Pretend to
   243  	// be systemstack_switch if the G stack is scanned.
   244  	MOVQ	$runtime·systemstack_switch(SB), SI
   245  	MOVQ	SI, (g_sched+gobuf_pc)(AX)
   246  	MOVQ	SP, (g_sched+gobuf_sp)(AX)
   247  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   248  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   249  
   250  	// switch to g0
   251  	MOVQ	DX, g(CX)
   252  	MOVQ	(g_sched+gobuf_sp)(DX), BX
   253  	// make it look like mstart called systemstack on g0, to stop traceback
   254  	SUBQ	$8, BX
   255  	MOVQ	$runtime·mstart(SB), DX
   256  	MOVQ	DX, 0(BX)	// fake return address slot holding mstart's address
   257  	MOVQ	BX, SP
   258  
   259  	// call target function
   260  	MOVQ	DI, DX	// DX = the func value itself (closure context)
   261  	MOVQ	0(DI), DI	// DI = code pointer stored in the func value
   262  	CALL	DI
   263  
   264  	// switch back to g
   265  	get_tls(CX)
   266  	MOVQ	g(CX), AX
   267  	MOVQ	g_m(AX), BX
   268  	MOVQ	m_curg(BX), AX
   269  	MOVQ	AX, g(CX)
   270  	MOVQ	(g_sched+gobuf_sp)(AX), SP
   271  	MOVQ	$0, (g_sched+gobuf_sp)(AX)	// clear the saved SP now that it has been restored
   272  	RET
   273  
   274  noswitch:
   275  	// already on m stack, just call directly
   276  	MOVQ	DI, DX
   277  	MOVQ	0(DI), DI
   278  	CALL	DI
   279  	RET
   280  
   281  /*
   282   * support for morestack
   283   */
   284  
   285  // Called during function prolog when more stack is needed.
   286  //
   287  // The traceback routines see morestack on a g0 as being
   288  // the top of a stack (for example, morestack calling newstack
   289  // calling the scheduler calling newm calling gc), so we must
   290  // record an argument size. For that purpose, it has no arguments.
        //
        // Saves f's caller in m->morebuf and f's own context (including DX as
        // ctxt) in g->sched, then switches to m->g0 and calls newstack.
   291  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   292  	// Cannot grow scheduler stack (m->g0).
   293  	get_tls(CX)
   294  	MOVQ	g(CX), BX
   295  	MOVQ	g_m(BX), BX
   296  	MOVQ	m_g0(BX), SI
   297  	CMPQ	g(CX), SI
   298  	JNE	2(PC)	// not on g0: skip the trap
   299  	INT	$3
   300  
   301  	// Cannot grow signal stack (m->gsignal).
   302  	MOVQ	m_gsignal(BX), SI
   303  	CMPQ	g(CX), SI
   304  	JNE	2(PC)	// not on gsignal: skip the trap
   305  	INT	$3
   306  
   307  	// Called from f.
   308  	// Set m->morebuf to f's caller.
   309  	MOVQ	8(SP), AX	// f's caller's PC
   310  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   311  	LEAQ	16(SP), AX	// f's caller's SP
   312  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   313  	get_tls(CX)
   314  	MOVQ	g(CX), SI
   315  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   316  
   317  	// Set g->sched to context in f.
   318  	MOVQ	0(SP), AX // f's PC
   319  	MOVQ	AX, (g_sched+gobuf_pc)(SI)
   320  	MOVQ	SI, (g_sched+gobuf_g)(SI)
   321  	LEAQ	8(SP), AX // f's SP
   322  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   323  	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
   324  	MOVQ	BP, (g_sched+gobuf_bp)(SI)
   325  
   326  	// Call newstack on m->g0's stack.
   327  	MOVQ	m_g0(BX), BX
   328  	MOVQ	BX, g(CX)
   329  	MOVQ	(g_sched+gobuf_sp)(BX), SP
   330  	CALL	runtime·newstack(SB)
   331  	MOVQ	$0, 0x1003	// crash if newstack returns
   332  	RET
   333  
   334  // morestack but not preserving ctxt.
        // Clears DX, which morestack saves as g->sched.ctxt, then tail-jumps to morestack.
   335  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   336  	MOVL	$0, DX
   337  	JMP	runtime·morestack(SB)
   338  
   339  TEXT runtime·stackBarrier(SB),NOSPLIT,$0
        	// Landing pad for a return whose saved PC was replaced by a stack
        	// barrier. Looks up the current barrier record in g.stkbar, checks
        	// that the record matches the stack slot that was just popped, bumps
        	// g.stkbarPos, and resumes at the original return PC.
        	// Uses CX, DX, BX and R8 as scratch; AX is left untouched.
   340  	// We came here via a RET to an overwritten return PC.
   341  	// AX may be live. Other registers are available.
   342  
   343  	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   344  	get_tls(CX)
   345  	MOVQ	g(CX), CX
   346  	MOVQ	(g_stkbar+slice_array)(CX), DX
   347  	MOVQ	g_stkbarPos(CX), BX
   348  	IMULQ	$stkbar__size, BX	// Too big for SIB.
   349  	MOVQ	stkbar_savedLRPtr(DX)(BX*1), R8
   350  	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
   351  	// Assert that we're popping the right saved LR.
        	// The RET that brought us here already popped the 8-byte return
        	// slot, so SP must now equal savedLRPtr+8. Anything else means the
        	// barrier bookkeeping is out of sync, and we crash on purpose.
        	ADDQ	$8, R8
   352  	CMPQ	R8, SP
   353  	JEQ	2(PC)	// in sync: skip the deliberate fault
   354  	MOVL	$0, 0
   355  	// Record that this stack barrier was hit.
   356  	ADDQ	$1, g_stkbarPos(CX)
   357  	// Jump to the original return PC.
   358  	JMP	BX
   359  
   360  // reflectcall: call a function with the given argument list
   361  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   362  // we don't have variable-sized frames, so we use a small number
   363  // of constant-sized-frame functions to encode a few bits of size in the pc.
   364  // Caution: ugly multiline assembly macros in your future!
   365  
        // DISPATCH(NAME, MAXSIZE): if CX (the argument size) is at most MAXSIZE,
        // compared as unsigned, tail-jump to NAME through AX; otherwise skip the
        // jump and fall through to whatever follows the macro.
   366  #define DISPATCH(NAME,MAXSIZE)		\
   367  	CMPQ	CX, $MAXSIZE;		\
   368  	JA	3(PC);			\
   369  	MOVQ	$NAME(SB), AX;		\
   370  	JMP	AX
   371  // Note: can't just "JMP NAME(SB)" - bad inlining results.
   372  
   373  TEXT reflect·call(SB), NOSPLIT, $0-0
        	// Entry point exported under the reflect package name; tail-jumps
        	// to this package's reflectcall without touching the arguments.
   374  	JMP	·reflectcall(SB)
   375  
   376  TEXT ·reflectcall(SB), NOSPLIT, $0-32
        	// Loads the zero-extended 32-bit argsize into CX and walks the
        	// DISPATCH list in ascending order, tail-jumping to the first callNNN
        	// whose frame is large enough. Sizes above 1073741824 fall through
        	// to badreflectcall.
   377  	MOVLQZX argsize+24(FP), CX
   378  	// NOTE(rsc): No call16, because CALLFN needs four words
   379  	// of argument space to invoke callwritebarrier.
   380  	DISPATCH(runtime·call32, 32)
   381  	DISPATCH(runtime·call64, 64)
   382  	DISPATCH(runtime·call128, 128)
   383  	DISPATCH(runtime·call256, 256)
   384  	DISPATCH(runtime·call512, 512)
   385  	DISPATCH(runtime·call1024, 1024)
   386  	DISPATCH(runtime·call2048, 2048)
   387  	DISPATCH(runtime·call4096, 4096)
   388  	DISPATCH(runtime·call8192, 8192)
   389  	DISPATCH(runtime·call16384, 16384)
   390  	DISPATCH(runtime·call32768, 32768)
   391  	DISPATCH(runtime·call65536, 65536)
   392  	DISPATCH(runtime·call131072, 131072)
   393  	DISPATCH(runtime·call262144, 262144)
   394  	DISPATCH(runtime·call524288, 524288)
   395  	DISPATCH(runtime·call1048576, 1048576)
   396  	DISPATCH(runtime·call2097152, 2097152)
   397  	DISPATCH(runtime·call4194304, 4194304)
   398  	DISPATCH(runtime·call8388608, 8388608)
   399  	DISPATCH(runtime·call16777216, 16777216)
   400  	DISPATCH(runtime·call33554432, 33554432)
   401  	DISPATCH(runtime·call67108864, 67108864)
   402  	DISPATCH(runtime·call134217728, 134217728)
   403  	DISPATCH(runtime·call268435456, 268435456)
   404  	DISPATCH(runtime·call536870912, 536870912)
   405  	DISPATCH(runtime·call1073741824, 1073741824)
   406  	MOVQ	$runtime·badreflectcall(SB), AX
   407  	JMP	AX
   408  
        // CALLFN(NAME, MAXSIZE) defines a WRAPPER function NAME with a frame of
        // MAXSIZE bytes and the same 32 bytes of arguments as reflectcall. It
        // copies argsize bytes from argptr to the bottom of its frame, calls
        // through f, copies the bytes from retoffset up to argsize back to
        // argptr, and then calls callwritebarrier with
        // (argtype, argptr, argsize, retoffset).
   409  #define CALLFN(NAME,MAXSIZE)			\
   410  TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   411  	NO_LOCAL_POINTERS;			\
   412  	/* copy arguments to stack */		\
   413  	MOVQ	argptr+16(FP), SI;		\
   414  	MOVLQZX argsize+24(FP), CX;		\
   415  	MOVQ	SP, DI;				\
   416  	REP;MOVSB;				\
   417  	/* call function */			\
   418  	MOVQ	f+8(FP), DX;			\
   419  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   420  	CALL	(DX);				\
   421  	/* copy return values back */		\
   422  	MOVQ	argptr+16(FP), DI;		\
   423  	MOVLQZX	argsize+24(FP), CX;		\
   424  	MOVLQZX retoffset+28(FP), BX;		\
   425  	MOVQ	SP, SI;				\
   426  	ADDQ	BX, DI;				\
   427  	ADDQ	BX, SI;				\
   428  	SUBQ	BX, CX;				\
   429  	REP;MOVSB;				\
   430  	/* execute write barrier updates */	\
   431  	MOVQ	argtype+0(FP), DX;		\
   432  	MOVQ	argptr+16(FP), DI;		\
   433  	MOVLQZX	argsize+24(FP), CX;		\
   434  	MOVLQZX retoffset+28(FP), BX;		\
   435  	MOVQ	DX, 0(SP);			\
   436  	MOVQ	DI, 8(SP);			\
   437  	MOVQ	CX, 16(SP);			\
   438  	MOVQ	BX, 24(SP);			\
   439  	CALL	runtime·callwritebarrier(SB);	\
   440  	RET
   441  
   442  CALLFN(·call32, 32)
        // One fixed-frame function per power of two from 32 bytes up to
        // 1073741824 bytes; the list mirrors the DISPATCH entries in reflectcall.
   443  CALLFN(·call64, 64)
   444  CALLFN(·call128, 128)
   445  CALLFN(·call256, 256)
   446  CALLFN(·call512, 512)
   447  CALLFN(·call1024, 1024)
   448  CALLFN(·call2048, 2048)
   449  CALLFN(·call4096, 4096)
   450  CALLFN(·call8192, 8192)
   451  CALLFN(·call16384, 16384)
   452  CALLFN(·call32768, 32768)
   453  CALLFN(·call65536, 65536)
   454  CALLFN(·call131072, 131072)
   455  CALLFN(·call262144, 262144)
   456  CALLFN(·call524288, 524288)
   457  CALLFN(·call1048576, 1048576)
   458  CALLFN(·call2097152, 2097152)
   459  CALLFN(·call4194304, 4194304)
   460  CALLFN(·call8388608, 8388608)
   461  CALLFN(·call16777216, 16777216)
   462  CALLFN(·call33554432, 33554432)
   463  CALLFN(·call67108864, 67108864)
   464  CALLFN(·call134217728, 134217728)
   465  CALLFN(·call268435456, 268435456)
   466  CALLFN(·call536870912, 536870912)
   467  CALLFN(·call1073741824, 1073741824)
   468  
   469  // bool cas(int32 *ptr, int32 old, int32 new)
   470  // 32-bit compare-and-swap, performed atomically:
   471  //	if(*ptr != old)
   472  //		return 0;
   473  //	*ptr = new;
   474  //	return 1;
   475  // The result is written as a single byte at ret+16(FP).
   476  TEXT runtime·cas(SB), NOSPLIT, $0-17
   477  	MOVL	old+8(FP), AX	// CMPXCHG compares memory against AX
   478  	MOVL	new+12(FP), DX
   479  	MOVQ	ptr+0(FP), SI
   480  	LOCK
   481  	CMPXCHGL	DX, (SI)
   482  	SETEQ	ret+16(FP)	// ZF set means the exchange happened
   483  	RET
   484  
   485  // bool runtime·cas64(uint64 *ptr, uint64 old, uint64 new)
   486  // 64-bit compare-and-swap, performed atomically:
   487  //	if(*ptr != old)
   488  //		return 0;
   489  //	*ptr = new;
   490  //	return 1;
   491  // The result is written as a single byte at ret+24(FP).
   492  //
   493  TEXT runtime·cas64(SB), NOSPLIT, $0-25
   494  	MOVQ	old+8(FP), AX	// CMPXCHG compares memory against AX
   495  	MOVQ	new+16(FP), DX
   496  	MOVQ	ptr+0(FP), SI
   497  	LOCK
   498  	CMPXCHGQ	DX, (SI)
   499  	SETEQ	ret+24(FP)	// ZF set means the exchange happened
   500  	RET
   501  	
   502  TEXT runtime·casuintptr(SB), NOSPLIT, $0-25
        	// Same $0-25 argument layout as cas64; tail-jumps to it.
   503  	JMP	runtime·cas64(SB)
   504  
   505  TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-16
        	// Tail-jumps to atomicload64, which is defined outside this part of the file.
   506  	JMP	runtime·atomicload64(SB)
   507  
   508  TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-16
        	// Tail-jumps to atomicload64, which is defined outside this part of the file.
   509  	JMP	runtime·atomicload64(SB)
   510  
   511  TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16
        	// Same $0-16 argument layout as atomicstore64; tail-jumps to it.
   512  	JMP	runtime·atomicstore64(SB)
   513  
   514  // bool casp(void **ptr, void *old, void *new)
   515  // Pointer-sized compare-and-swap, performed atomically:
   516  //	if(*ptr != old)
   517  //		return 0;
   518  //	*ptr = new;
   519  //	return 1;
   520  // The result is written as a single byte at ret+24(FP).
   521  TEXT runtime·casp1(SB), NOSPLIT, $0-25
   522  	MOVQ	old+8(FP), AX	// CMPXCHG compares memory against AX
   523  	MOVQ	new+16(FP), DX
   524  	MOVQ	ptr+0(FP), SI
   525  	LOCK
   526  	CMPXCHGQ	DX, (SI)
   527  	SETEQ	ret+24(FP)	// ZF set means the exchange happened
   528  	RET
   529  
   530  // uint32 xadd(uint32 volatile *ptr, int32 delta)
   531  // Atomic 32-bit add that returns the updated value:
   532  //	*ptr += delta;
   533  //	return *ptr;
   534  TEXT runtime·xadd(SB), NOSPLIT, $0-20
   535  	MOVL	delta+8(FP), CX
   536  	MOVQ	ptr+0(FP), SI
   537  	MOVL	CX, DX		// DX is consumed by XADD; CX keeps delta
   538  	LOCK
   539  	XADDL	DX, (SI)	// DX = value before the add
   540  	ADDL	CX, DX		// old + delta = value after the add
   541  	MOVL	DX, ret+16(FP)
   542  	RET
   543  
   544  TEXT runtime·xadd64(SB), NOSPLIT, $0-24
   545  	MOVQ	delta+8(FP), CX		// 64-bit variant of xadd: returns the updated value
   546  	MOVQ	ptr+0(FP), SI
   547  	MOVQ	CX, DX			// DX is consumed by XADD; CX keeps delta
   548  	LOCK
   549  	XADDQ	DX, (SI)		// DX = value before the add
   550  	ADDQ	CX, DX			// old + delta = value after the add
   551  	MOVQ	DX, ret+16(FP)
   552  	RET
   553  
   554  TEXT runtime·xadduintptr(SB), NOSPLIT, $0-24
        	// Same $0-24 argument layout as xadd64; tail-jumps to it.
   555  	JMP	runtime·xadd64(SB)
   556  
   557  TEXT runtime·xchg(SB), NOSPLIT, $0-20
   558  	MOVL	new+8(FP), CX		// 32-bit exchange: store new, return the previous value
   559  	MOVQ	ptr+0(FP), SI
   560  	XCHGL	CX, (SI)		// CX = previous contents of *ptr
   561  	MOVL	CX, ret+16(FP)
   562  	RET
   563  
   564  TEXT runtime·xchg64(SB), NOSPLIT, $0-24
   565  	MOVQ	new+8(FP), CX		// 64-bit exchange: store new, return the previous value
   566  	MOVQ	ptr+0(FP), SI
   567  	XCHGQ	CX, (SI)		// CX = previous contents of *ptr
   568  	MOVQ	CX, ret+16(FP)
   569  	RET
   570  
   571  TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24
        	// Same $0-24 argument layout as xchg64; tail-jumps to it.
   572  	JMP	runtime·xchg64(SB)
   573  
   574  TEXT runtime·procyield(SB),NOSPLIT,$0-0
        	// Executes PAUSE `cycles` times. The counter is decremented before it
        	// is tested, so cycles == 0 would wrap around and spin about 2^32 times.
        	// NOTE(review): the frame is declared $0-0 although a 32-bit argument
        	// is read from cycles+0(FP) — confirm against the Go declaration.
   575  	MOVL	cycles+0(FP), AX
   576  again:
   577  	PAUSE
   578  	SUBL	$1, AX
   579  	JNZ	again
   580  	RET
   581  
   582  TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16
   583  	MOVQ	val+8(FP), CX		// pointer-sized store done with XCHG; old value is discarded
   584  	MOVQ	ptr+0(FP), SI
   585  	XCHGQ	CX, (SI)
   586  	RET
   587  
   588  TEXT runtime·atomicstore(SB), NOSPLIT, $0-12
   589  	MOVL	val+8(FP), CX		// 32-bit store done with XCHG; old value is discarded
   590  	MOVQ	ptr+0(FP), SI
   591  	XCHGL	CX, (SI)
   592  	RET
   593  
   594  TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16
   595  	MOVQ	val+8(FP), CX		// 64-bit store done with XCHG; old value is discarded
   596  	MOVQ	ptr+0(FP), SI
   597  	XCHGQ	CX, (SI)
   598  	RET
   599  
   600  // void runtime·atomicor8(byte volatile *ptr, byte val);
        // Atomically performs *ptr |= val on a single byte.
   601  TEXT runtime·atomicor8(SB), NOSPLIT, $0-9
   602  	MOVB	val+8(FP), CX
   603  	MOVQ	ptr+0(FP), DX
   604  	LOCK
   605  	ORB	CX, (DX)
   606  	RET
   607  
   608  // void runtime·atomicand8(byte volatile *ptr, byte val);
        // Atomically performs *ptr &= val on a single byte.
   609  TEXT runtime·atomicand8(SB), NOSPLIT, $0-9
   610  	MOVB	val+8(FP), CX
   611  	MOVQ	ptr+0(FP), DX
   612  	LOCK
   613  	ANDB	CX, (DX)
   614  	RET
   615  
   616  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
        	// Emits no fence instruction; the body is a bare RET.
   617  	// Stores are already ordered on x86, so this is just a
   618  	// compile barrier.
   619  	RET
   620  
   621  // void jmpdefer(fn, sp);
   622  // called from deferreturn.
   623  // 1. pop the caller
   624  // 2. sub 5 bytes from the callers return
   625  // 3. jmp to the argument
        // NOTE(review): the constant 5 presumably matches the length of the CALL
        // instruction that invoked deferreturn — confirm against the compiler.
   626  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   627  	MOVQ	fv+0(FP), DX	// fn
   628  	MOVQ	argp+8(FP), BX	// caller sp
   629  	LEAQ	-8(BX), SP	// caller sp after CALL
   630  	SUBQ	$5, (SP)	// return to CALL again
   631  	MOVQ	0(DX), BX	// BX = code pointer stored in the func value; DX stays as context
   632  	JMP	BX	// but first run the deferred function
   633  
   634  // Save state of caller into g->sched. Smashes R8, R9.
        // Stores the caller's PC, SP and BP and zeroes ret and ctxt.
        // Unlike runtime·gosave, it does not write gobuf_g.
   635  TEXT gosave<>(SB),NOSPLIT,$0
   636  	get_tls(R8)
   637  	MOVQ	g(R8), R8
   638  	MOVQ	0(SP), R9	// return address = caller's PC
   639  	MOVQ	R9, (g_sched+gobuf_pc)(R8)
   640  	LEAQ	8(SP), R9	// SP as it was before the CALL pushed the return address
   641  	MOVQ	R9, (g_sched+gobuf_sp)(R8)
   642  	MOVQ	$0, (g_sched+gobuf_ret)(R8)
   643  	MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
   644  	MOVQ	BP, (g_sched+gobuf_bp)(R8)
   645  	RET
   646  
   647  // func asmcgocall(fn, arg unsafe.Pointer) int32
   648  // Call fn(arg) on the scheduler stack,
   649  // aligned appropriately for the gcc ABI.
   650  // See cgocall.go for more details.
        // The 32-bit value left in AX by fn is stored as the result.
   651  TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   652  	MOVQ	fn+0(FP), AX
   653  	MOVQ	arg+8(FP), BX
   654  
   655  	MOVQ	SP, DX	// remember the SP we entered with
   656  
   657  	// Figure out if we need to switch to m->g0 stack.
   658  	// We get called to create new OS threads too, and those
   659  	// come in on the m->g0 stack already.
   660  	get_tls(CX)
   661  	MOVQ	g(CX), R8
   662  	MOVQ	g_m(R8), R8
   663  	MOVQ	m_g0(R8), SI
   664  	MOVQ	g(CX), DI
   665  	CMPQ	SI, DI
   666  	JEQ	nosave	// already on g0
   667  	MOVQ	m_gsignal(R8), SI
   668  	CMPQ	SI, DI
   669  	JEQ	nosave	// on gsignal: do not switch either
   670  	
   671  	MOVQ	m_g0(R8), SI
   672  	CALL	gosave<>(SB)
   673  	MOVQ	SI, g(CX)
   674  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   675  nosave:
   676  
   677  	// Now on a scheduling stack (a pthread-created stack).
   678  	// Make sure we have enough room for 4 stack-backed fast-call
   679  	// registers as per windows amd64 calling convention.
   680  	SUBQ	$64, SP
   681  	ANDQ	$~15, SP	// alignment for gcc ABI
   682  	MOVQ	DI, 48(SP)	// save g
   683  	MOVQ	(g_stack+stack_hi)(DI), DI
   684  	SUBQ	DX, DI
   685  	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   686  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   687  	MOVQ	BX, CX		// CX = first argument in Win64
   688  	CALL	AX
   689  
   690  	// Restore registers, g, stack pointer.
   691  	get_tls(CX)
   692  	MOVQ	48(SP), DI
   693  	MOVQ	(g_stack+stack_hi)(DI), SI
   694  	SUBQ	40(SP), SI	// SI = stack.hi - saved depth = original SP
   695  	MOVQ	DI, g(CX)
   696  	MOVQ	SI, SP
   697  
   698  	MOVL	AX, ret+16(FP)
   699  	RET
   700  
   701  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
   702  // Turn the fn into a Go func (by taking its address) and call
   703  // cgocallback_gofunc.
        // The three arguments are copied into this function's 24-byte frame and
        // cgocallback_gofunc is called indirectly through AX.
   704  TEXT runtime·cgocallback(SB),NOSPLIT,$24-24
   705  	LEAQ	fn+0(FP), AX	// address of the fn slot is passed as the first argument
   706  	MOVQ	AX, 0(SP)
   707  	MOVQ	frame+8(FP), AX
   708  	MOVQ	AX, 8(SP)
   709  	MOVQ	framesize+16(FP), AX
   710  	MOVQ	AX, 16(SP)
   711  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   712  	CALL	AX
   713  	RET
   714  
   715  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
   716  // See cgocall.go for more details.
        // Obtains an m via needm when the thread has no g, switches from m->g0 to
        // m->curg, calls cgocallbackg there, switches back to m->g0, and calls
        // dropm if the m was borrowed. R8 carries the m found on entry (0 when
        // needm was used) and is saved at 0(SP) across the call.
   717  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-24
   718  	NO_LOCAL_POINTERS
   719  
   720  	// If g is nil, Go did not create the current thread.
   721  	// Call needm to obtain one m for temporary use.
   722  	// In this case, we're running on the thread stack, so there's
   723  	// lots of space, but the linker doesn't know. Hide the call from
   724  	// the linker analysis by using an indirect call through AX.
   725  	get_tls(CX)
   726  #ifdef GOOS_windows
   727  	MOVL	$0, BX
   728  	CMPQ	CX, $0
   729  	JEQ	2(PC)	// TLS base is 0: skip the load below, leaving BX = 0
   730  #endif
   731  	MOVQ	g(CX), BX
   732  	CMPQ	BX, $0
   733  	JEQ	needm
   734  	MOVQ	g_m(BX), BX
   735  	MOVQ	BX, R8 // holds oldm until end of function
   736  	JMP	havem
   737  needm:
   738  	MOVQ	$0, 0(SP)
   739  	MOVQ	$runtime·needm(SB), AX
   740  	CALL	AX
   741  	MOVQ	0(SP), R8	// reload the word stored as 0 before the needm call
   742  	get_tls(CX)
   743  	MOVQ	g(CX), BX
   744  	MOVQ	g_m(BX), BX
   745  	
   746  	// Set m->sched.sp = SP, so that if a panic happens
   747  	// during the function we are about to execute, it will
   748  	// have a valid SP to run on the g0 stack.
   749  	// The next few lines (after the havem label)
   750  	// will save this SP onto the stack and then write
   751  	// the same SP back to m->sched.sp. That seems redundant,
   752  	// but if an unrecovered panic happens, unwindm will
   753  	// restore the g->sched.sp from the stack location
   754  	// and then systemstack will try to use it. If we don't set it here,
   755  	// that restored SP will be uninitialized (typically 0) and
   756  	// will not be usable.
   757  	MOVQ	m_g0(BX), SI
   758  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   759  
   760  havem:
   761  	// Now there's a valid m, and we're running on its m->g0.
   762  	// Save current m->g0->sched.sp on stack and then set it to SP.
   763  	// Save current sp in m->g0->sched.sp in preparation for
   764  	// switch back to m->curg stack.
   765  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   766  	MOVQ	m_g0(BX), SI
   767  	MOVQ	(g_sched+gobuf_sp)(SI), AX
   768  	MOVQ	AX, 0(SP)
   769  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   770  
   771  	// Switch to m->curg stack and call runtime.cgocallbackg.
   772  	// Because we are taking over the execution of m->curg
   773  	// but *not* resuming what had been running, we need to
   774  	// save that information (m->curg->sched) so we can restore it.
   775  	// We can restore m->curg->sched.sp easily, because calling
   776  	// runtime.cgocallbackg leaves SP unchanged upon return.
   777  	// To save m->curg->sched.pc, we push it onto the stack.
   778  	// This has the added benefit that it looks to the traceback
   779  	// routine like cgocallbackg is going to return to that
   780  	// PC (because the frame we allocate below has the same
   781  	// size as cgocallback_gofunc's frame declared above)
   782  	// so that the traceback will seamlessly trace back into
   783  	// the earlier calls.
   784  	//
   785  	// In the new goroutine, 0(SP) holds the saved R8.
   786  	MOVQ	m_curg(BX), SI
   787  	MOVQ	SI, g(CX)
   788  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   789  	MOVQ	(g_sched+gobuf_pc)(SI), BX
   790  	MOVQ	BX, -8(DI)
   791  	// Compute the size of the frame, including return PC and, if
   792  	// GOEXPERIMENT=framepointer, the saved base pointer
   793  	LEAQ	fv+0(FP), AX
   794  	SUBQ	SP, AX	// AX = FP - SP
   795  	SUBQ	AX, DI	// leave the same gap below curg's saved SP
   796  	MOVQ	DI, SP
   797  
   798  	MOVQ	R8, 0(SP)
   799  	CALL	runtime·cgocallbackg(SB)
   800  	MOVQ	0(SP), R8
   801  
   802  	// Compute the size of the frame again.  FP and SP have
   803  	// completely different values here than they did above,
   804  	// but only their difference matters.
   805  	LEAQ	fv+0(FP), AX
   806  	SUBQ	SP, AX
   807  
   808  	// Restore g->sched (== m->curg->sched) from saved values.
   809  	get_tls(CX)
   810  	MOVQ	g(CX), SI
   811  	MOVQ	SP, DI
   812  	ADDQ	AX, DI	// DI = the curg SP loaded before the switch
   813  	MOVQ	-8(DI), BX	// the PC stored at -8(DI) before the switch
   814  	MOVQ	BX, (g_sched+gobuf_pc)(SI)
   815  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   816  
   817  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   818  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   819  	// so we do not have to restore it.)
   820  	MOVQ	g(CX), BX
   821  	MOVQ	g_m(BX), BX
   822  	MOVQ	m_g0(BX), SI
   823  	MOVQ	SI, g(CX)
   824  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   825  	MOVQ	0(SP), AX
   826  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   827  	
   828  	// If the m on entry was nil, we called needm above to borrow an m
   829  	// for the duration of the call. Since the call is over, return it with dropm.
   830  	CMPQ	R8, $0
   831  	JNE 3(PC)	// had an m on entry: skip the dropm call
   832  	MOVQ	$runtime·dropm(SB), AX
   833  	CALL	AX
   834  
   835  	// Done!
   836  	RET
   837  
   838  // void setg(G*); set g. for use by needm.
        // On Windows builds the 0x28(GS) slot is maintained first: it is cleared
        // (and the function returns) when gg is nil, otherwise it is pointed at
        // gg->m->tls. On every other path gg is stored in the TLS g slot.
   839  TEXT runtime·setg(SB), NOSPLIT, $0-8
   840  	MOVQ	gg+0(FP), BX
   841  #ifdef GOOS_windows
   842  	CMPQ	BX, $0
   843  	JNE	settls
   844  	MOVQ	$0, 0x28(GS)
   845  	RET
   846  settls:
   847  	MOVQ	g_m(BX), AX
   848  	LEAQ	m_tls(AX), AX
   849  	MOVQ	AX, 0x28(GS)
   850  #endif
   851  	get_tls(CX)
   852  	MOVQ	BX, g(CX)
   853  	RET
   854  
   855  // void setg_gcc(G*); set g called from gcc.
        // Takes the g pointer in DI rather than from the Go argument frame.
        // Its address is handed to _cgo_init by rt0_go.
   856  TEXT setg_gcc<>(SB),NOSPLIT,$0
   857  	get_tls(AX)
   858  	MOVQ	DI, g(AX)
   859  	RET
   860  
   861  // check that SP is in range [g->stack.lo, g->stack.hi)
        // Traps with INT $3 when the check fails. As written, both comparisons are
        // strict and unsigned, so the condition enforced is stack.lo < SP < stack.hi.
   862  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   863  	get_tls(CX)
   864  	MOVQ	g(CX), AX
   865  	CMPQ	(g_stack+stack_hi)(AX), SP
   866  	JHI	2(PC)	// stack.hi > SP: ok
   867  	INT	$3
   868  	CMPQ	SP, (g_stack+stack_lo)(AX)
   869  	JHI	2(PC)	// SP > stack.lo: ok
   870  	INT	$3
   871  	RET
   872  
   873  TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
        	// Returns the word stored just below argp (the return PC of the
        	// function whose first argument argp points at). If that word equals
        	// stackBarrierPC, the result of nextBarrierPC is returned instead.
   874  	MOVQ	argp+0(FP),AX		// addr of first arg
   875  	MOVQ	-8(AX),AX		// get calling pc
   876  	CMPQ	AX, runtime·stackBarrierPC(SB)
   877  	JNE	nobar
   878  	// Get original return PC.
   879  	CALL	runtime·nextBarrierPC(SB)
   880  	MOVQ	0(SP), AX	// nextBarrierPC's result, left in this function's 8-byte frame
   881  nobar:
   882  	MOVQ	AX, ret+8(FP)
   883  	RET
   884  
   885  TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
        	// Overwrites the return PC stored just below argp with pc. If that slot
        	// currently holds stackBarrierPC, the slot is left alone and pc is
        	// passed to setNextBarrierPC instead.
   886  	MOVQ	argp+0(FP),AX		// addr of first arg
   887  	MOVQ	pc+8(FP), BX
   888  	MOVQ	-8(AX), CX
   889  	CMPQ	CX, runtime·stackBarrierPC(SB)
   890  	JEQ	setbar
   891  	MOVQ	BX, -8(AX)		// set calling pc
   892  	RET
   893  setbar:
   894  	// Set the stack barrier return PC.
   895  	MOVQ	BX, 0(SP)	// argument for setNextBarrierPC
   896  	CALL	runtime·setNextBarrierPC(SB)
   897  	RET
   898  
   899  TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
   900  	MOVQ	argp+0(FP), CX		// the result is simply the argp value passed in
   901  	MOVQ	CX, ret+8(FP)
   902  	RET
   903  
   904  // func cputicks() int64
        // Issues LFENCE when lfenceBeforeRdtsc is 1 (set by rt0_go on Intel CPUs),
        // otherwise MFENCE, then executes RDTSC and returns DX<<32 + AX.
        // NOTE(review): the frame is declared $0-0 although a 64-bit result is
        // written to ret+0(FP) — confirm against the Go declaration.
   905  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   906  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   907  	JNE	mfence
   908  	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
   909  	JMP	done
   910  mfence:
   911  	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
   912  done:
   913  	RDTSC
   914  	SHLQ	$32, DX	// move the DX half into the upper 32 bits
   915  	ADDQ	DX, AX	// combine with the AX half
   916  	MOVQ	AX, ret+0(FP)
   917  	RET
   918  
   919  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   920  // redirects to memhash(p, h, size) using the size
   921  // stored in the closure.
        // The size is read from offset 8 of the closure that DX points at.
   922  TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
   923  	GO_ARGS
   924  	NO_LOCAL_POINTERS
   925  	MOVQ	p+0(FP), AX
   926  	MOVQ	h+8(FP), BX
   927  	MOVQ	8(DX), CX	// size, taken from the closure
   928  	MOVQ	AX, 0(SP)	// memhash arg 0: p
   929  	MOVQ	BX, 8(SP)	// memhash arg 1: h
   930  	MOVQ	CX, 16(SP)	// memhash arg 2: size
   931  	CALL	runtime·memhash(SB)
   932  	MOVQ	24(SP), AX	// memhash result
   933  	MOVQ	AX, ret+16(FP)
   934  	RET
   935  
   936  // hash function using AES hardware instructions
        // Loads the data pointer into AX, the size into CX and the address of the
        // result slot into DX, then tail-jumps to aeshashbody.
   937  TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   938  	MOVQ	p+0(FP), AX	// ptr to data
   939  	MOVQ	s+16(FP), CX	// size
   940  	LEAQ	ret+24(FP), DX
   941  	JMP	runtime·aeshashbody(SB)
   942  
   943  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
        	// String variant of aeshash: unpacks the data pointer and length from
        	// the string struct that p points at, then tail-jumps to aeshashbody
        	// with DX holding the address of the result slot.
   944  	MOVQ	p+0(FP), AX	// ptr to string struct
   945  	MOVQ	8(AX), CX	// length of string
   946  	MOVQ	(AX), AX	// string data
   947  	LEAQ	ret+16(FP), DX
   948  	JMP	runtime·aeshashbody(SB)
   949  
   950  // AX: data
   951  // CX: length in bytes
   952  // DX: address to put return value (the low 64 bits of the final state are stored there)
   953  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   954  	// Fill an SSE register with our seeds.
   955  	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed (the h argument of the function that jumped here)
   956  	PINSRW	$4, CX, X0			// 16 bits of length
   957  	PSHUFHW $0, X0, X0			// repeat length 4 times total
   958  	MOVO	X0, X1				// save unscrambled seed
   959  	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
   960  	AESENC	X0, X0				// scramble seed
   961  
   962  	CMPQ	CX, $16				// dispatch on length; each size class has its own code path
   963  	JB	aes0to15
   964  	JE	aes16
   965  	CMPQ	CX, $32
   966  	JBE	aes17to32
   967  	CMPQ	CX, $64
   968  	JBE	aes33to64
   969  	CMPQ	CX, $128
   970  	JBE	aes65to128
   971  	JMP	aes129plus
   972  
   973  aes0to15:
   974  	TESTQ	CX, CX
   975  	JE	aes0
   976  
   977  	ADDQ	$16, AX
   978  	TESTW	$0xff0, AX	// zero iff bits 4-11 of data+16 are clear, i.e. data lies in the last 16 bytes of a 4096-byte-aligned region
   979  	JE	endofpage
   980  
   981  	// 16 bytes loaded at this address won't cross
   982  	// a page boundary, so we can load it directly.
   983  	MOVOU	-16(AX), X1	// AX was advanced by 16 above, so this is the original data pointer
   984  	ADDQ	CX, CX		// CX = 2*length, so (AX)(CX*8) below addresses the 16-byte table entry number length
   985  	MOVQ	$masks<>(SB), AX
   986  	PAND	(AX)(CX*8), X1	// keep only the low length bytes
   987  final1:
   988  	AESENC	X0, X1	// scramble input, xor in seed
   989  	AESENC	X1, X1  // scramble combo 2 times
   990  	AESENC	X1, X1
   991  	MOVQ	X1, (DX)
   992  	RET
   993  
   994  endofpage:
   995  	// address ends in 1111xxxx.  Might be up against
   996  	// a page boundary, so load ending at last byte.
   997  	// Then shift bytes down using pshufb.
   998  	MOVOU	-32(AX)(CX*1), X1	// AX is data+16, so this loads the 16 bytes ending at data+length
   999  	ADDQ	CX, CX			// CX = 2*length: index of the 16-byte shifts entry
  1000  	MOVQ	$shifts<>(SB), AX
  1001  	PSHUFB	(AX)(CX*8), X1
  1002  	JMP	final1
  1003  
  1004  aes0:
  1005  	// Return scrambled input seed
  1006  	AESENC	X0, X0
  1007  	MOVQ	X0, (DX)
  1008  	RET
  1009  
  1010  aes16:
  1011  	MOVOU	(AX), X1	// exactly one 16-byte block; no masking needed
  1012  	JMP	final1
  1013  
  1014  aes17to32:
  1015  	// make second starting seed
  1016  	PXOR	runtime·aeskeysched+16(SB), X1
  1017  	AESENC	X1, X1
  1018  	
  1019  	// load data to be hashed
  1020  	MOVOU	(AX), X2		// first 16 bytes
  1021  	MOVOU	-16(AX)(CX*1), X3	// last 16 bytes (overlaps the first load when length < 32)
  1022  
  1023  	// scramble 3 times
  1024  	AESENC	X0, X2
  1025  	AESENC	X1, X3
  1026  	AESENC	X2, X2
  1027  	AESENC	X3, X3
  1028  	AESENC	X2, X2
  1029  	AESENC	X3, X3
  1030  
  1031  	// combine results
  1032  	PXOR	X3, X2
  1033  	MOVQ	X2, (DX)
  1034  	RET
  1035  
  1036  aes33to64:
  1037  	// make 3 more starting seeds
  1038  	MOVO	X1, X2
  1039  	MOVO	X1, X3
  1040  	PXOR	runtime·aeskeysched+16(SB), X1
  1041  	PXOR	runtime·aeskeysched+32(SB), X2
  1042  	PXOR	runtime·aeskeysched+48(SB), X3
  1043  	AESENC	X1, X1
  1044  	AESENC	X2, X2
  1045  	AESENC	X3, X3
  1046  	
  1047  	MOVOU	(AX), X4		// first 32 bytes ...
  1048  	MOVOU	16(AX), X5
  1049  	MOVOU	-32(AX)(CX*1), X6	// ... and last 32 bytes (possibly overlapping)
  1050  	MOVOU	-16(AX)(CX*1), X7
  1051  	
  1052  	AESENC	X0, X4			// scramble data, xor in seeds
  1053  	AESENC	X1, X5
  1054  	AESENC	X2, X6
  1055  	AESENC	X3, X7
  1056  	
  1057  	AESENC	X4, X4			// two more scrambling rounds
  1058  	AESENC	X5, X5
  1059  	AESENC	X6, X6
  1060  	AESENC	X7, X7
  1061  	
  1062  	AESENC	X4, X4
  1063  	AESENC	X5, X5
  1064  	AESENC	X6, X6
  1065  	AESENC	X7, X7
  1066  
  1067  	PXOR	X6, X4			// fold the four lanes into X4
  1068  	PXOR	X7, X5
  1069  	PXOR	X5, X4
  1070  	MOVQ	X4, (DX)
  1071  	RET
  1072  
  1073  aes65to128:
  1074  	// make 7 more starting seeds
  1075  	MOVO	X1, X2
  1076  	MOVO	X1, X3
  1077  	MOVO	X1, X4
  1078  	MOVO	X1, X5
  1079  	MOVO	X1, X6
  1080  	MOVO	X1, X7
  1081  	PXOR	runtime·aeskeysched+16(SB), X1
  1082  	PXOR	runtime·aeskeysched+32(SB), X2
  1083  	PXOR	runtime·aeskeysched+48(SB), X3
  1084  	PXOR	runtime·aeskeysched+64(SB), X4
  1085  	PXOR	runtime·aeskeysched+80(SB), X5
  1086  	PXOR	runtime·aeskeysched+96(SB), X6
  1087  	PXOR	runtime·aeskeysched+112(SB), X7
  1088  	AESENC	X1, X1
  1089  	AESENC	X2, X2
  1090  	AESENC	X3, X3
  1091  	AESENC	X4, X4
  1092  	AESENC	X5, X5
  1093  	AESENC	X6, X6
  1094  	AESENC	X7, X7
  1095  
  1096  	// load data: first 64 bytes and last 64 bytes (possibly overlapping)
  1097  	MOVOU	(AX), X8
  1098  	MOVOU	16(AX), X9
  1099  	MOVOU	32(AX), X10
  1100  	MOVOU	48(AX), X11
  1101  	MOVOU	-64(AX)(CX*1), X12
  1102  	MOVOU	-48(AX)(CX*1), X13
  1103  	MOVOU	-32(AX)(CX*1), X14
  1104  	MOVOU	-16(AX)(CX*1), X15
  1105  
  1106  	// scramble data, xor in seed
  1107  	AESENC	X0, X8
  1108  	AESENC	X1, X9
  1109  	AESENC	X2, X10
  1110  	AESENC	X3, X11
  1111  	AESENC	X4, X12
  1112  	AESENC	X5, X13
  1113  	AESENC	X6, X14
  1114  	AESENC	X7, X15
  1115  
  1116  	// scramble twice
  1117  	AESENC	X8, X8
  1118  	AESENC	X9, X9
  1119  	AESENC	X10, X10
  1120  	AESENC	X11, X11
  1121  	AESENC	X12, X12
  1122  	AESENC	X13, X13
  1123  	AESENC	X14, X14
  1124  	AESENC	X15, X15
  1125  	
  1126  	AESENC	X8, X8
  1127  	AESENC	X9, X9
  1128  	AESENC	X10, X10
  1129  	AESENC	X11, X11
  1130  	AESENC	X12, X12
  1131  	AESENC	X13, X13
  1132  	AESENC	X14, X14
  1133  	AESENC	X15, X15
  1134  
  1135  	// combine results: fold the eight lanes into X8
  1136  	PXOR	X12, X8
  1137  	PXOR	X13, X9
  1138  	PXOR	X14, X10
  1139  	PXOR	X15, X11
  1140  	PXOR	X10, X8
  1141  	PXOR	X11, X9
  1142  	PXOR	X9, X8
  1143  	MOVQ	X8, (DX)
  1144  	RET
  1145  
  1146  aes129plus:
  1147  	// make 7 more starting seeds
  1148  	MOVO	X1, X2
  1149  	MOVO	X1, X3
  1150  	MOVO	X1, X4
  1151  	MOVO	X1, X5
  1152  	MOVO	X1, X6
  1153  	MOVO	X1, X7
  1154  	PXOR	runtime·aeskeysched+16(SB), X1
  1155  	PXOR	runtime·aeskeysched+32(SB), X2
  1156  	PXOR	runtime·aeskeysched+48(SB), X3
  1157  	PXOR	runtime·aeskeysched+64(SB), X4
  1158  	PXOR	runtime·aeskeysched+80(SB), X5
  1159  	PXOR	runtime·aeskeysched+96(SB), X6
  1160  	PXOR	runtime·aeskeysched+112(SB), X7
  1161  	AESENC	X1, X1
  1162  	AESENC	X2, X2
  1163  	AESENC	X3, X3
  1164  	AESENC	X4, X4
  1165  	AESENC	X5, X5
  1166  	AESENC	X6, X6
  1167  	AESENC	X7, X7
  1168  	
  1169  	// start with last (possibly overlapping) block
  1170  	MOVOU	-128(AX)(CX*1), X8
  1171  	MOVOU	-112(AX)(CX*1), X9
  1172  	MOVOU	-96(AX)(CX*1), X10
  1173  	MOVOU	-80(AX)(CX*1), X11
  1174  	MOVOU	-64(AX)(CX*1), X12
  1175  	MOVOU	-48(AX)(CX*1), X13
  1176  	MOVOU	-32(AX)(CX*1), X14
  1177  	MOVOU	-16(AX)(CX*1), X15
  1178  
  1179  	// scramble input once, xor in seed
  1180  	AESENC	X0, X8
  1181  	AESENC	X1, X9
  1182  	AESENC	X2, X10
  1183  	AESENC	X3, X11
  1184  	AESENC	X4, X12
  1185  	AESENC	X5, X13
  1186  	AESENC	X6, X14
  1187  	AESENC	X7, X15
  1188  	
  1189  	// compute number of remaining 128-byte blocks: (length-1)/128.
  1190  	DECQ	CX	// length >= 129 on this path, so the count is at least 1
  1191  	SHRQ	$7, CX
  1192  	
  1193  aesloop:
  1194  	// scramble state, xor in a block
  1195  	MOVOU	(AX), X0	// X0-X7 are reused as data registers; the seeds are already mixed into X8-X15
  1196  	MOVOU	16(AX), X1
  1197  	MOVOU	32(AX), X2
  1198  	MOVOU	48(AX), X3
  1199  	AESENC	X0, X8
  1200  	AESENC	X1, X9
  1201  	AESENC	X2, X10
  1202  	AESENC	X3, X11
  1203  	MOVOU	64(AX), X4
  1204  	MOVOU	80(AX), X5
  1205  	MOVOU	96(AX), X6
  1206  	MOVOU	112(AX), X7
  1207  	AESENC	X4, X12
  1208  	AESENC	X5, X13
  1209  	AESENC	X6, X14
  1210  	AESENC	X7, X15
  1211  
  1212  	// scramble state
  1213  	AESENC	X8, X8
  1214  	AESENC	X9, X9
  1215  	AESENC	X10, X10
  1216  	AESENC	X11, X11
  1217  	AESENC	X12, X12
  1218  	AESENC	X13, X13
  1219  	AESENC	X14, X14
  1220  	AESENC	X15, X15
  1221  
  1222  	ADDQ	$128, AX	// advance to the next 128-byte block
  1223  	DECQ	CX
  1224  	JNE	aesloop
  1225  
  1226  	// 2 more scrambles to finish
  1227  	AESENC	X8, X8
  1228  	AESENC	X9, X9
  1229  	AESENC	X10, X10
  1230  	AESENC	X11, X11
  1231  	AESENC	X12, X12
  1232  	AESENC	X13, X13
  1233  	AESENC	X14, X14
  1234  	AESENC	X15, X15
  1235  	AESENC	X8, X8
  1236  	AESENC	X9, X9
  1237  	AESENC	X10, X10
  1238  	AESENC	X11, X11
  1239  	AESENC	X12, X12
  1240  	AESENC	X13, X13
  1241  	AESENC	X14, X14
  1242  	AESENC	X15, X15
  1243  
  1244  	PXOR	X12, X8		// fold the eight lanes into X8
  1245  	PXOR	X13, X9
  1246  	PXOR	X14, X10
  1247  	PXOR	X15, X11
  1248  	PXOR	X10, X8
  1249  	PXOR	X11, X9
  1250  	PXOR	X9, X8
  1251  	MOVQ	X8, (DX)
  1252  	RET
  1253  	
  1254  TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
  1255  	MOVQ	h+8(FP), X0	// X0 low quadword = seed
  1256  	MOVQ	p+0(FP), AX	// AX = ptr to the 4-byte key
  1257  	PINSRD	$2, (AX), X0	// key goes into doubleword lane 2, next to the seed
  1258  	AESENC	runtime·aeskeysched+0(SB), X0	// three rounds keyed by successive 16-byte entries of aeskeysched
  1259  	AESENC	runtime·aeskeysched+16(SB), X0
  1260  	AESENC	runtime·aeskeysched+32(SB), X0
  1261  	MOVQ	X0, ret+16(FP)	// hash = low 64 bits of the state
  1262  	RET
  1263  
  1264  TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
  1265  	MOVQ	h+8(FP), X0	// X0 low quadword = seed
  1266  	MOVQ	p+0(FP), AX	// AX = ptr to the 8-byte key
  1267  	PINSRQ	$1, (AX), X0	// key goes into the high quadword
  1268  	AESENC	runtime·aeskeysched+0(SB), X0	// three rounds keyed by successive 16-byte entries of aeskeysched
  1269  	AESENC	runtime·aeskeysched+16(SB), X0
  1270  	AESENC	runtime·aeskeysched+32(SB), X0
  1271  	MOVQ	X0, ret+16(FP)	// hash = low 64 bits of the state
  1272  	RET
  1273  
  1274  // simple masks to get rid of data in the high part of the register: 16 entries of 16 bytes each; entry n (at offset 16*n) has its low n bytes set to 0xff and the rest zero.
  1275  DATA masks<>+0x00(SB)/8, $0x0000000000000000
  1276  DATA masks<>+0x08(SB)/8, $0x0000000000000000
  1277  DATA masks<>+0x10(SB)/8, $0x00000000000000ff
  1278  DATA masks<>+0x18(SB)/8, $0x0000000000000000
  1279  DATA masks<>+0x20(SB)/8, $0x000000000000ffff
  1280  DATA masks<>+0x28(SB)/8, $0x0000000000000000
  1281  DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
  1282  DATA masks<>+0x38(SB)/8, $0x0000000000000000
  1283  DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
  1284  DATA masks<>+0x48(SB)/8, $0x0000000000000000
  1285  DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
  1286  DATA masks<>+0x58(SB)/8, $0x0000000000000000
  1287  DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
  1288  DATA masks<>+0x68(SB)/8, $0x0000000000000000
  1289  DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
  1290  DATA masks<>+0x78(SB)/8, $0x0000000000000000
  1291  DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
  1292  DATA masks<>+0x88(SB)/8, $0x0000000000000000
  1293  DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
  1294  DATA masks<>+0x98(SB)/8, $0x00000000000000ff
  1295  DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
  1296  DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
  1297  DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
  1298  DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
  1299  DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
  1300  DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
  1301  DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
  1302  DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
  1303  DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
  1304  DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
  1305  DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
  1306  DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
  1307  GLOBL masks<>(SB),RODATA,$256
  1308  
  1309  // these are arguments to pshufb.  They move data down from
  1310  // the high bytes of the register to the low bytes of the register.
  1311  // index is how many bytes to move; each entry is 16 bytes, and 0xff selector bytes make pshufb write zero to that position.
  1312  DATA shifts<>+0x00(SB)/8, $0x0000000000000000
  1313  DATA shifts<>+0x08(SB)/8, $0x0000000000000000
  1314  DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
  1315  DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
  1316  DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
  1317  DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
  1318  DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
  1319  DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
  1320  DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
  1321  DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
  1322  DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
  1323  DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
  1324  DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
  1325  DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
  1326  DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
  1327  DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
  1328  DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
  1329  DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
  1330  DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
  1331  DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
  1332  DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
  1333  DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
  1334  DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
  1335  DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
  1336  DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
  1337  DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
  1338  DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
  1339  DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
  1340  DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
  1341  DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
  1342  DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
  1343  DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
  1344  GLOBL shifts<>(SB),RODATA,$256
  1345  
  1346  TEXT runtime·memeq(SB),NOSPLIT,$0-25
  1347  	LEAQ	ret+24(FP), AX		// AX = address of the result byte
  1348  	MOVQ	size+16(FP), BX		// BX = number of bytes to compare
  1349  	MOVQ	b+8(FP), DI
  1350  	MOVQ	a+0(FP), SI
  1351  	JMP	runtime·memeqbody(SB)	// tail jump; memeqbody stores the answer through AX
  1352  
  1353  // memequal_varlen(a, b unsafe.Pointer) bool
  1354  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
  1355  	MOVQ	b+8(FP), DI
  1356  	MOVQ	a+0(FP), SI
  1357  	CMPQ	DI, SI
  1358  	JNE	compare
  1359  	MOVB	$1, ret+16(FP)	// identical pointers: equal without reading memory
  1360  	RET
  1361  compare:
  1362  	LEAQ	ret+16(FP), AX
  1363  	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
  1364  	JMP	runtime·memeqbody(SB)
  1365  
  1366  // eqstring reports whether two strings hold the same bytes.
  1367  // The compiler only passes strings whose lengths already
  1368  // match, so only s1's length is consulted here.
  1369  // runtime_test.go:eqstring_generic is the equivalent
  1370  // Go code.
  1371  TEXT runtime·eqstring(SB),NOSPLIT,$0-33
  1372  	MOVQ	s2str+16(FP), DI
  1373  	MOVQ	s1str+0(FP), SI
  1374  	CMPQ	DI, SI
  1375  	JNE	compare
  1376  	MOVB	$1, v+32(FP)	// same data pointer: equal, no memory compare needed
  1377  	RET
  1378  compare:
  1379  	LEAQ	v+32(FP), AX	// AX = address of the result byte
  1380  	MOVQ	s1len+8(FP), BX	// BX = number of bytes to compare
  1381  	JMP	runtime·memeqbody(SB)
  1382  
  1383  // a in SI
  1384  // b in DI
  1385  // count in BX
  1386  // address of result byte in AX (1 is stored if the ranges are equal, 0 otherwise)
  1387  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1388  	CMPQ	BX, $8
  1389  	JB	small
  1390  	
  1391  	// 64 bytes at a time using xmm registers
  1392  hugeloop:
  1393  	CMPQ	BX, $64
  1394  	JB	bigloop
  1395  	MOVOU	(SI), X0
  1396  	MOVOU	(DI), X1
  1397  	MOVOU	16(SI), X2
  1398  	MOVOU	16(DI), X3
  1399  	MOVOU	32(SI), X4
  1400  	MOVOU	32(DI), X5
  1401  	MOVOU	48(SI), X6
  1402  	MOVOU	48(DI), X7
  1403  	PCMPEQB	X1, X0		// each byte becomes 0xff where a and b agree
  1404  	PCMPEQB	X3, X2
  1405  	PCMPEQB	X5, X4
  1406  	PCMPEQB	X7, X6
  1407  	PAND	X2, X0		// AND the four results together into X0
  1408  	PAND	X6, X4
  1409  	PAND	X4, X0
  1410  	PMOVMSKB X0, DX		// one bit per byte; 0xffff means all 64 bytes matched
  1411  	ADDQ	$64, SI
  1412  	ADDQ	$64, DI
  1413  	SUBQ	$64, BX
  1414  	CMPL	DX, $0xffff
  1415  	JEQ	hugeloop
  1416  	MOVB	$0, (AX)
  1417  	RET
  1418  
  1419  	// 8 bytes at a time using 64-bit register
  1420  bigloop:
  1421  	CMPQ	BX, $8
  1422  	JBE	leftover
  1423  	MOVQ	(SI), CX
  1424  	MOVQ	(DI), DX
  1425  	ADDQ	$8, SI
  1426  	ADDQ	$8, DI
  1427  	SUBQ	$8, BX
  1428  	CMPQ	CX, DX
  1429  	JEQ	bigloop
  1430  	MOVB	$0, (AX)
  1431  	RET
  1432  
  1433  	// remaining 0-8 bytes: compare the 8 bytes that end at the end of each range.
  1434  leftover:
  1435  	MOVQ	-8(SI)(BX*1), CX	// may re-read bytes already compared; the original count was >= 8 on this path
  1436  	MOVQ	-8(DI)(BX*1), DX
  1437  	CMPQ	CX, DX
  1438  	SETEQ	(AX)
  1439  	RET
  1440  
  1441  small:
  1442  	CMPQ	BX, $0
  1443  	JEQ	equal		// zero length: ZF is still set from the CMPQ, so SETEQ at equal stores 1
  1444  
  1445  	LEAQ	0(BX*8), CX	// CX = number of bits to compare
  1446  	NEGQ	CX		// shift counts use only the low 6 bits, so this acts as 64 - bits
  1447  
  1448  	CMPB	SI, $0xf8
  1449  	JA	si_high
  1450  
  1451  	// load at SI won't cross a page boundary.
  1452  	MOVQ	(SI), SI
  1453  	JMP	si_finish
  1454  si_high:
  1455  	// address ends in 11111xxx.  Load up to bytes we want, move to correct position.
  1456  	MOVQ	-8(SI)(BX*1), SI
  1457  	SHRQ	CX, SI
  1458  si_finish:
  1459  
  1460  	// same for DI.
  1461  	CMPB	DI, $0xf8
  1462  	JA	di_high
  1463  	MOVQ	(DI), DI
  1464  	JMP	di_finish
  1465  di_high:
  1466  	MOVQ	-8(DI)(BX*1), DI
  1467  	SHRQ	CX, DI
  1468  di_finish:
  1469  
  1470  	SUBQ	SI, DI		// difference of the two 8-byte loads
  1471  	SHLQ	CX, DI		// shift out the bytes beyond count; ZF is set iff the wanted low bytes are equal
  1472  equal:
  1473  	SETEQ	(AX)
  1474  	RET
  1475  
  1476  TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
  1477  	LEAQ	ret+32(FP), R9		// R9 = address of the result word
  1478  	MOVQ	s2_len+24(FP), DX	// DX = blen
  1479  	MOVQ	s2_base+16(FP), DI	// DI = b
  1480  	MOVQ	s1_len+8(FP), BX	// BX = alen
  1481  	MOVQ	s1_base+0(FP), SI	// SI = a
  1482  	JMP	runtime·cmpbody(SB)
  1483  
  1484  TEXT bytes·Compare(SB),NOSPLIT,$0-56
  1485  	LEAQ	res+48(FP), R9	// R9 = address of the result word
  1486  	MOVQ	s2+32(FP), DX	// DX = blen (second word of the s2 argument)
  1487  	MOVQ	s2+24(FP), DI	// DI = b (first word of the s2 argument)
  1488  	MOVQ	s1+8(FP), BX	// BX = alen
  1489  	MOVQ	s1+0(FP), SI	// SI = a
  1490  	JMP	runtime·cmpbody(SB)
  1491  
  1492  // input:
  1493  //   SI = a
  1494  //   DI = b
  1495  //   BX = alen
  1496  //   DX = blen
  1497  //   R9 = address of output word (stores -1/0/1 here)
  1498  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1499  	CMPQ	SI, DI
  1500  	JEQ	allsame		// same start address: only the lengths can differ
  1501  	CMPQ	BX, DX
  1502  	MOVQ	DX, R8
  1503  	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
  1504  	CMPQ	R8, $8
  1505  	JB	small
  1506  
  1507  	CMPQ	R8, $63
  1508  	JA	big_loop
  1509  loop:
  1510  	CMPQ	R8, $16
  1511  	JBE	_0through16
  1512  	MOVOU	(SI), X0
  1513  	MOVOU	(DI), X1
  1514  	PCMPEQB X0, X1
  1515  	PMOVMSKB X1, AX
  1516  	XORQ	$0xffff, AX	// convert EQ to NE
  1517  	JNE	diff16	// branch if at least one byte is not equal
  1518  	ADDQ	$16, SI
  1519  	ADDQ	$16, DI
  1520  	SUBQ	$16, R8
  1521  	JMP	loop
  1522  	
  1523  diff64:			// difference is in the 4th 16-byte chunk of a big_loop iteration
  1524  	ADDQ	$48, SI
  1525  	ADDQ	$48, DI
  1526  	JMP	diff16
  1527  diff48:			// difference is in the 3rd chunk
  1528  	ADDQ	$32, SI
  1529  	ADDQ	$32, DI
  1530  	JMP	diff16
  1531  diff32:			// difference is in the 2nd chunk
  1532  	ADDQ	$16, SI
  1533  	ADDQ	$16, DI
  1534  	// AX = bit mask of differences
  1535  diff16:
  1536  	BSFQ	AX, BX	// index of first byte that differs
  1537  	XORQ	AX, AX
  1538  	MOVB	(SI)(BX*1), CX
  1539  	CMPB	CX, (DI)(BX*1)
  1540  	SETHI	AX	// unsigned byte compare: 1 if a's byte is larger
  1541  	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
  1542  	MOVQ	AX, (R9)
  1543  	RET
  1544  
  1545  	// 0 through 16 bytes left, alen>=8, blen>=8
  1546  _0through16:
  1547  	CMPQ	R8, $8
  1548  	JBE	_0through8
  1549  	MOVQ	(SI), AX
  1550  	MOVQ	(DI), CX
  1551  	CMPQ	AX, CX
  1552  	JNE	diff8
  1553  _0through8:
  1554  	MOVQ	-8(SI)(R8*1), AX	// last 8 bytes of the compared range (may overlap bytes already checked)
  1555  	MOVQ	-8(DI)(R8*1), CX
  1556  	CMPQ	AX, CX
  1557  	JEQ	allsame
  1558  
  1559  	// AX and CX contain parts of a and b that differ.
  1560  diff8:
  1561  	BSWAPQ	AX	// reverse order of bytes, so the earliest byte becomes the most significant
  1562  	BSWAPQ	CX
  1563  	XORQ	AX, CX
  1564  	BSRQ	CX, CX	// index of highest bit difference
  1565  	SHRQ	CX, AX	// move a's bit to bottom
  1566  	ANDQ	$1, AX	// mask bit
  1567  	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
  1568  	MOVQ	AX, (R9)
  1569  	RET
  1570  
  1571  	// 0-7 bytes in common
  1572  small:
  1573  	LEAQ	(R8*8), CX	// bytes left -> bits left
  1574  	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
  1575  	JEQ	allsame		// NEGQ set ZF: nothing to compare, decide on lengths
  1576  
  1577  	// load bytes of a into high bytes of SI
  1578  	CMPB	SI, $0xf8
  1579  	JA	si_high
  1580  	MOVQ	(SI), SI
  1581  	JMP	si_finish
  1582  si_high:
  1583  	MOVQ	-8(SI)(R8*1), SI	// low address byte is above 0xf8: load the 8 bytes ending at the last wanted byte instead
  1584  	SHRQ	CX, SI			// bring the wanted bytes down to the bottom
  1585  si_finish:
  1586  	SHLQ	CX, SI			// move the wanted bytes to the top, discarding the others
  1587  
  1588  	// load bytes of b into high bytes of DI
  1589  	CMPB	DI, $0xf8
  1590  	JA	di_high
  1591  	MOVQ	(DI), DI
  1592  	JMP	di_finish
  1593  di_high:
  1594  	MOVQ	-8(DI)(R8*1), DI
  1595  	SHRQ	CX, DI
  1596  di_finish:
  1597  	SHLQ	CX, DI
  1598  
  1599  	BSWAPQ	SI	// reverse order of bytes
  1600  	BSWAPQ	DI
  1601  	XORQ	SI, DI	// find bit differences
  1602  	JEQ	allsame
  1603  	BSRQ	DI, CX	// index of highest bit difference
  1604  	SHRQ	CX, SI	// move a's bit to bottom
  1605  	ANDQ	$1, SI	// mask bit
  1606  	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
  1607  	MOVQ	AX, (R9)
  1608  	RET
  1609  
  1610  allsame:
  1611  	XORQ	AX, AX
  1612  	XORQ	CX, CX
  1613  	CMPQ	BX, DX
  1614  	SETGT	AX	// 1 if alen > blen
  1615  	SETEQ	CX	// 1 if alen == blen
  1616  	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
  1617  	MOVQ	AX, (R9)
  1618  	RET
  1619  
  1620  	// this works for >= 64 bytes of data.
  1621  big_loop:
  1622  	MOVOU	(SI), X0
  1623  	MOVOU	(DI), X1
  1624  	PCMPEQB X0, X1
  1625  	PMOVMSKB X1, AX
  1626  	XORQ	$0xffff, AX	// nonzero iff some byte of this 16-byte chunk differs
  1627  	JNE	diff16
  1628  
  1629  	MOVOU	16(SI), X0
  1630  	MOVOU	16(DI), X1
  1631  	PCMPEQB X0, X1
  1632  	PMOVMSKB X1, AX
  1633  	XORQ	$0xffff, AX
  1634  	JNE	diff32
  1635  
  1636  	MOVOU	32(SI), X0
  1637  	MOVOU	32(DI), X1
  1638  	PCMPEQB X0, X1
  1639  	PMOVMSKB X1, AX
  1640  	XORQ	$0xffff, AX
  1641  	JNE	diff48
  1642  
  1643  	MOVOU	48(SI), X0
  1644  	MOVOU	48(DI), X1
  1645  	PCMPEQB X0, X1
  1646  	PMOVMSKB X1, AX
  1647  	XORQ	$0xffff, AX
  1648  	JNE	diff64
  1649  
  1650  	ADDQ	$64, SI
  1651  	ADDQ	$64, DI
  1652  	SUBQ	$64, R8
  1653  	CMPQ	R8, $64
  1654  	JBE	loop		// 64 or fewer bytes left: finish in the 16-byte loop
  1655  	JMP	big_loop
  1656  
  1657  TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
  1658  	LEAQ ret+32(FP), R8	// R8 = address to put result
  1659  	MOVB c+24(FP), AL	// AL = byte sought
  1660  	MOVQ s_len+8(FP), BX	// BX = data len
  1661  	MOVQ s+0(FP), SI	// SI = data
  1662  	JMP  runtime·indexbytebody(SB)
  1663  
  1664  TEXT strings·IndexByte(SB),NOSPLIT,$0-32
  1665  	LEAQ ret+24(FP), R8	// R8 = address to put result
  1666  	MOVB c+16(FP), AL	// AL = byte sought
  1667  	MOVQ s_len+8(FP), BX	// BX = data len
  1668  	MOVQ s+0(FP), SI	// SI = data
  1669  	JMP  runtime·indexbytebody(SB)
  1670  
  1671  // input:
  1672  //   SI: data
  1673  //   BX: data len
  1674  //   AL: byte sought
  1675  //   R8: address to put result (index of the first match, or -1 if there is none)
  1676  TEXT runtime·indexbytebody(SB),NOSPLIT,$0
  1677  	MOVQ SI, DI	// DI is the scan cursor; SI keeps the start for the final index computation
  1678  
  1679  	CMPQ BX, $16
  1680  	JLT small
  1681  
  1682  	// round up to first 16-byte boundary
  1683  	TESTQ $15, SI
  1684  	JZ aligned
  1685  	MOVQ SI, CX
  1686  	ANDQ $~15, CX
  1687  	ADDQ $16, CX
  1688  
  1689  	// search the beginning
  1690  	SUBQ SI, CX	// CX = number of bytes before the boundary
  1691  	REPN; SCASB
  1692  	JZ success
  1693  
  1694  // DI is 16-byte aligned; get ready to search using SSE instructions
  1695  aligned:
  1696  	// round down to last 16-byte boundary
  1697  	MOVQ BX, R11
  1698  	ADDQ SI, R11
  1699  	ANDQ $~15, R11
  1700  
  1701  	// shuffle X0 around so that each byte contains c
  1702  	MOVD AX, X0
  1703  	PUNPCKLBW X0, X0
  1704  	PUNPCKLBW X0, X0
  1705  	PSHUFL $0, X0, X0
  1706  	JMP condition
  1707  
  1708  sse:
  1709  	// move the next 16-byte chunk of the buffer into X1
  1710  	MOVO (DI), X1
  1711  	// compare bytes in X0 to X1
  1712  	PCMPEQB X0, X1
  1713  	// take the top bit of each byte in X1 and put the result in DX
  1714  	PMOVMSKB X1, DX
  1715  	TESTL DX, DX
  1716  	JNZ ssesuccess
  1717  	ADDQ $16, DI
  1718  
  1719  condition:
  1720  	CMPQ DI, R11
  1721  	JLT sse
  1722  
  1723  	// search the end
  1724  	MOVQ SI, CX
  1725  	ADDQ BX, CX
  1726  	SUBQ R11, CX	// CX = number of bytes after the last 16-byte boundary
  1727  	// if CX == 0, the zero flag will be set and we'll end up
  1728  	// returning a false success
  1729  	JZ failure
  1730  	REPN; SCASB
  1731  	JZ success
  1732  
  1733  failure:
  1734  	MOVQ $-1, (R8)
  1735  	RET
  1736  
  1737  // handle for lengths < 16
  1738  small:
  1739  	MOVQ BX, CX
  1740  	REPN; SCASB
  1741  	JZ success
  1742  	MOVQ $-1, (R8)
  1743  	RET
  1744  
  1745  // we've found the chunk containing the byte
  1746  // now just figure out which specific byte it is
  1747  ssesuccess:
  1748  	// get the index of the least significant set bit
  1749  	BSFW DX, DX
  1750  	SUBQ SI, DI	// offset of the chunk from the start of the data
  1751  	ADDQ DI, DX
  1752  	MOVQ DX, (R8)
  1753  	RET
  1754  
  1755  success:
  1756  	SUBQ SI, DI	// SCASB left DI one past the matching byte
  1757  	SUBQ $1, DI	// 64-bit subtract: a 32-bit SUBL would clear the upper half of DI and truncate indexes >= 1<<32
  1758  	MOVQ DI, (R8)
  1759  	RET
  1760  
  1761  TEXT bytes·Equal(SB),NOSPLIT,$0-49
  1762  	MOVQ	b_len+32(FP), CX
  1763  	MOVQ	a_len+8(FP), BX
  1764  	CMPQ	CX, BX
  1765  	JEQ	samelen
  1766  	MOVB	$0, ret+48(FP)	// different lengths: not equal, no memory compare needed
  1767  	RET
  1768  samelen:
  1769  	LEAQ	ret+48(FP), AX	// AX = address of the result byte
  1770  	MOVQ	b+24(FP), DI
  1771  	MOVQ	a+0(FP), SI
  1772  	JMP	runtime·memeqbody(SB)	// BX already holds the common length
  1773  
  1774  TEXT runtime·fastrand1(SB), NOSPLIT, $0-4	// advances the 32-bit m_fastrand state of the current g's m and returns the new value
  1775  	get_tls(CX)
  1776  	MOVQ	g(CX), AX
  1777  	MOVQ	g_m(AX), AX		// AX = current g's m
  1778  	MOVL	m_fastrand(AX), DX	// DX = current state x
  1779  	ADDL	DX, DX			// x += x
  1780  	MOVL	DX, BX			// BX = doubled value, kept in case the XOR must be undone
  1781  	XORL	$0x88888eef, DX		// sets SF from the XORed result; the constant has bit 31 set, so SF is the inverse of bit 31 of 2x
  1782  	CMOVLMI	BX, DX			// SF set (bit 31 of 2x was clear): take the un-XORed value back
  1783  	MOVL	DX, m_fastrand(AX)	// store the new state
  1784  	MOVL	DX, ret+0(FP)
  1785  	RET
  1786  
  1787  TEXT runtime·return0(SB), NOSPLIT, $0	// sets AX to 0 and returns; takes no arguments and writes no result slot
  1788  	MOVL	$0, AX
  1789  	RET
  1790  
  1791  
  1792  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  1793  // Must obey the gcc calling convention; the result is left in AX.
  1794  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  1795  	get_tls(CX)
  1796  	MOVQ	g(CX), AX		// AX = g
  1797  	MOVQ	g_m(AX), AX		// AX = g->m
  1798  	MOVQ	m_curg(AX), AX		// AX = g->m->curg
  1799  	MOVQ	(g_stack+stack_hi)(AX), AX	// AX = curg.stack.hi
  1800  	RET
  1801  
  1802  // The top-most function running on a goroutine
  1803  // returns to goexit+PCQuantum, i.e. just past the leading NOP byte below.
  1804  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  1805  	BYTE	$0x90	// NOP
  1806  	CALL	runtime·goexit1(SB)	// does not return
  1807  	// traceback from goexit1 must hit code range of goexit
  1808  	BYTE	$0x90	// NOP (keeps the return address of the CALL inside goexit)
  1809  
  1810  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8	// issues a PREFETCHT0 hint for the address passed in addr; no result
  1811  	MOVQ	addr+0(FP), AX
  1812  	PREFETCHT0	(AX)
  1813  	RET
  1814  
  1815  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8	// issues a PREFETCHT1 hint for the address passed in addr; no result
  1816  	MOVQ	addr+0(FP), AX
  1817  	PREFETCHT1	(AX)
  1818  	RET
  1819  
  1820  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8	// issues a PREFETCHT2 hint for the address passed in addr; no result
  1821  	MOVQ	addr+0(FP), AX
  1822  	PREFETCHT2	(AX)
  1823  	RET
  1824  
  1825  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8	// issues a PREFETCHNTA hint for the address passed in addr; no result
  1826  	MOVQ	addr+0(FP), AX
  1827  	PREFETCHNTA	(AX)
  1828  	RET
  1829  
  1830  // This is called from .init_array and follows the platform, not Go, ABI.
  1831  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  1832  	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
  1833  	MOVQ	runtime·lastmoduledatap(SB), AX
  1834  	MOVQ	DI, moduledata_next(AX)	// link the value in DI after the current last entry (presumably DI is the new moduledata pointer, the platform ABI's first argument; confirm against the caller)
  1835  	MOVQ	DI, runtime·lastmoduledatap(SB)	// and make it the new last entry
  1836  	POPQ	R15
  1837  	RET