github.com/q45/go@v0.0.0-20151101211701-a4fb8c13db3f/src/runtime/asm_amd64.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0	// process entry: argc arrives in DI, argv in SI
    11  	// copy arguments forward on an even stack
    12  	MOVQ	DI, AX		// argc
    13  	MOVQ	SI, BX		// argv
    14  	SUBQ	$(4*8+7), SP		// 2args 2auto
    15  	ANDQ	$~15, SP	// round SP down to a multiple of 16
    16  	MOVQ	AX, 16(SP)
    17  	MOVQ	BX, 24(SP)
    18  	
    19  	// create istack out of the given (operating system) stack.
    20  	// _cgo_init may update stackguard.
    21  	MOVQ	$runtime·g0(SB), DI
    22  	LEAQ	(-64*1024+104)(SP), BX	// provisional bound: 64KB minus 104 bytes below SP
    23  	MOVQ	BX, g_stackguard0(DI)
    24  	MOVQ	BX, g_stackguard1(DI)
    25  	MOVQ	BX, (g_stack+stack_lo)(DI)
    26  	MOVQ	SP, (g_stack+stack_hi)(DI)
    27  
    28  	// find out information about the processor we're on
    29  	MOVQ	$0, AX
    30  	CPUID
    31  	CMPQ	AX, $0
    32  	JE	nocpuinfo	// CPUID(0) left AX == 0: skip the vendor and feature queries
    33  
    34  	// Figure out how to serialize RDTSC.
    35  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    36  	// Don't know about the rest, so let's do MFENCE.
    37  	CMPL	BX, $0x756E6547  // "Genu"
    38  	JNE	notintel
    39  	CMPL	DX, $0x49656E69  // "ineI"
    40  	JNE	notintel
    41  	CMPL	CX, $0x6C65746E  // "ntel"
    42  	JNE	notintel
    43  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)	// consulted by cputicks to choose LFENCE
    44  notintel:
    45  
    46  	MOVQ	$1, AX
    47  	CPUID
    48  	MOVL	CX, runtime·cpuid_ecx(SB)	// record the CX/DX results of CPUID(1)
    49  	MOVL	DX, runtime·cpuid_edx(SB)
    50  nocpuinfo:	
    51  	
    52  	// if there is an _cgo_init, call it.
    53  	MOVQ	_cgo_init(SB), AX
    54  	TESTQ	AX, AX
    55  	JZ	needtls
    56  	// g0 already in DI
    57  	MOVQ	DI, CX	// Win64 uses CX for first parameter
    58  	MOVQ	$setg_gcc<>(SB), SI
    59  	CALL	AX
    60  
    61  	// update stackguard after _cgo_init
    62  	MOVQ	$runtime·g0(SB), CX
    63  	MOVQ	(g_stack+stack_lo)(CX), AX
    64  	ADDQ	$const__StackGuard, AX
    65  	MOVQ	AX, g_stackguard0(CX)
    66  	MOVQ	AX, g_stackguard1(CX)
    67  
    68  #ifndef GOOS_windows
    69  	JMP ok
    70  #endif
    71  needtls:
    72  #ifdef GOOS_plan9
    73  	// skip TLS setup on Plan 9
    74  	JMP ok
    75  #endif
    76  #ifdef GOOS_solaris
    77  	// skip TLS setup on Solaris
    78  	JMP ok
    79  #endif
    80  
    81  	LEAQ	runtime·tls0(SB), DI
    82  	CALL	runtime·settls(SB)
    83  
    84  	// store through it, to make sure it works
    85  	get_tls(BX)
    86  	MOVQ	$0x123, g(BX)
    87  	MOVQ	runtime·tls0(SB), AX
    88  	CMPQ	AX, $0x123
    89  	JEQ 2(PC)	// the marker landed in tls0: skip the abort
    90  	MOVL	AX, 0	// abort
    91  ok:
    92  	// set the per-goroutine and per-mach "registers"
    93  	get_tls(BX)
    94  	LEAQ	runtime·g0(SB), CX
    95  	MOVQ	CX, g(BX)
    96  	LEAQ	runtime·m0(SB), AX
    97  
    98  	// save m->g0 = g0
    99  	MOVQ	CX, m_g0(AX)
   100  	// save m0 to g0->m
   101  	MOVQ	AX, g_m(CX)
   102  
   103  	CLD				// convention is D is always left cleared
   104  	CALL	runtime·check(SB)
   105  
   106  	MOVL	16(SP), AX		// copy argc
   107  	MOVL	AX, 0(SP)
   108  	MOVQ	24(SP), AX		// copy argv
   109  	MOVQ	AX, 8(SP)
   110  	CALL	runtime·args(SB)
   111  	CALL	runtime·osinit(SB)
   112  	CALL	runtime·schedinit(SB)
   113  
   114  	// create a new goroutine to start program
   115  	MOVQ	$runtime·mainPC(SB), AX		// entry
   116  	PUSHQ	AX
   117  	PUSHQ	$0			// arg size
   118  	CALL	runtime·newproc(SB)
   119  	POPQ	AX	// discard the two words pushed for newproc
   120  	POPQ	AX
   121  
   122  	// start this M
   123  	CALL	runtime·mstart(SB)
   124  
   125  	MOVL	$0xf1, 0xf1  // crash: only reached if mstart returns
   126  	RET
   127  
   128  DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)	// mainPC holds the address of runtime·main
   129  GLOBL	runtime·mainPC(SB),RODATA,$8	// 8-byte read-only symbol; rt0_go passes its address to newproc
   130  
   131  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0	// executes a single breakpoint instruction, then returns
   132  	BYTE	$0xcc	// 0xCC is the one-byte INT 3 encoding
   133  	RET
   134  
   135  TEXT runtime·asminit(SB),NOSPLIT,$0-0	// intentionally empty on this architecture
   136  	// No per-thread init.
   137  	RET
   138  
   139  /*
   140   *  go-routine
   141   */
   142  
   143  // void gosave(Gobuf *buf)
   144  // Record the caller's SP, PC, BP and the current g in buf, zeroing ret and ctxt; like setjmp.
   145  TEXT runtime·gosave(SB), NOSPLIT, $0-8
   146  	MOVQ	buf+0(FP), AX		// gobuf
   147  	LEAQ	buf+0(FP), BX		// caller's SP
   148  	MOVQ	BX, gobuf_sp(AX)
   149  	MOVQ	0(SP), BX		// caller's PC
   150  	MOVQ	BX, gobuf_pc(AX)
   151  	MOVQ	$0, gobuf_ret(AX)	// no return value recorded
   152  	MOVQ	$0, gobuf_ctxt(AX)	// no context recorded
   153  	MOVQ	BP, gobuf_bp(AX)
   154  	get_tls(CX)
   155  	MOVQ	g(CX), BX	// BX = current g
   156  	MOVQ	BX, gobuf_g(AX)
   157  	RET
   158  
   159  // void gogo(Gobuf *buf)
   160  // Install buf's g, SP, ret (in AX), ctxt (in DX) and BP, then jump to buf's pc; like longjmp.
   161  TEXT runtime·gogo(SB), NOSPLIT, $0-8
   162  	MOVQ	buf+0(FP), BX		// gobuf
   163  	MOVQ	gobuf_g(BX), DX
   164  	MOVQ	0(DX), CX		// make sure g != nil
   165  	get_tls(CX)
   166  	MOVQ	DX, g(CX)	// make buf's g the current g
   167  	MOVQ	gobuf_sp(BX), SP	// restore SP
   168  	MOVQ	gobuf_ret(BX), AX
   169  	MOVQ	gobuf_ctxt(BX), DX
   170  	MOVQ	gobuf_bp(BX), BP
   171  	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   172  	MOVQ	$0, gobuf_ret(BX)
   173  	MOVQ	$0, gobuf_ctxt(BX)
   174  	MOVQ	$0, gobuf_bp(BX)
   175  	MOVQ	gobuf_pc(BX), BX	// BX is no longer needed as the gobuf pointer
   176  	JMP	BX
   177  
   178  // func mcall(fn func(*g))
   179  // Save the caller's PC, SP, BP in g->sched, switch to m->g0's stack, call fn(g).
   180  // Fn must never return.  It should gogo(&g->sched)
   181  // to keep running g.
   182  TEXT runtime·mcall(SB), NOSPLIT, $0-8
   183  	MOVQ	fn+0(FP), DI
   184  	
   185  	get_tls(CX)
   186  	MOVQ	g(CX), AX	// save state in g->sched
   187  	MOVQ	0(SP), BX	// caller's PC
   188  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   189  	LEAQ	fn+0(FP), BX	// caller's SP
   190  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   191  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   192  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   193  
   194  	// switch to m->g0 & its stack, call fn
   195  	MOVQ	g(CX), BX
   196  	MOVQ	g_m(BX), BX
   197  	MOVQ	m_g0(BX), SI
   198  	CMPQ	SI, AX	// if g == m->g0 call badmcall
   199  	JNE	3(PC)	// different g: step over the two-instruction badmcall jump
   200  	MOVQ	$runtime·badmcall(SB), AX
   201  	JMP	AX
   202  	MOVQ	SI, g(CX)	// g = m->g0
   203  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   204  	PUSHQ	AX	// fn's argument: the g we just left
   205  	MOVQ	DI, DX	// DX = the func value itself
   206  	MOVQ	0(DI), DI	// DI = code pointer stored in the first word of the func value
   207  	CALL	DI
   208  	POPQ	AX
   209  	MOVQ	$runtime·badmcall2(SB), AX	// fn returned, which the contract forbids
   210  	JMP	AX
   211  	RET
   212  
   213  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   214  // of the G stack (systemstack stores its address as the saved PC in g->sched).
   215  // We need to distinguish the routine that lives at the bottom of the G stack
   216  // from the one that lives at the top of the system stack because the one at
   217  // the top of the system stack terminates the stack walk (see topofstack()).
   218  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   219  	RET
   220  
   221  // func systemstack(fn func()): run fn on the g0 stack; call it directly when already on g0 or gsignal.
   222  TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   223  	MOVQ	fn+0(FP), DI	// DI = fn
   224  	get_tls(CX)
   225  	MOVQ	g(CX), AX	// AX = g
   226  	MOVQ	g_m(AX), BX	// BX = m
   227  
   228  	MOVQ	m_gsignal(BX), DX	// DX = gsignal
   229  	CMPQ	AX, DX
   230  	JEQ	noswitch
   231  
   232  	MOVQ	m_g0(BX), DX	// DX = g0
   233  	CMPQ	AX, DX
   234  	JEQ	noswitch
   235  
   236  	MOVQ	m_curg(BX), R8
   237  	CMPQ	AX, R8
   238  	JEQ	switch
   239  	
   240  	// Bad: g is not gsignal, not g0, not curg. What is it?
   241  	MOVQ	$runtime·badsystemstack(SB), AX
   242  	CALL	AX	// if this call returns, execution falls through into switch
   243  
   244  switch:
   245  	// save our state in g->sched.  Pretend to
   246  	// be systemstack_switch if the G stack is scanned.
   247  	MOVQ	$runtime·systemstack_switch(SB), SI
   248  	MOVQ	SI, (g_sched+gobuf_pc)(AX)
   249  	MOVQ	SP, (g_sched+gobuf_sp)(AX)
   250  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   251  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   252  
   253  	// switch to g0
   254  	MOVQ	DX, g(CX)	// DX still holds g0 from the check above
   255  	MOVQ	(g_sched+gobuf_sp)(DX), BX
   256  	// make it look like mstart called systemstack on g0, to stop traceback
   257  	SUBQ	$8, BX	// reserve one word for the fake return PC
   258  	MOVQ	$runtime·mstart(SB), DX
   259  	MOVQ	DX, 0(BX)
   260  	MOVQ	BX, SP
   261  
   262  	// call target function
   263  	MOVQ	DI, DX	// DX = the func value itself
   264  	MOVQ	0(DI), DI	// DI = code pointer stored in the first word of the func value
   265  	CALL	DI
   266  
   267  	// switch back to g
   268  	get_tls(CX)
   269  	MOVQ	g(CX), AX
   270  	MOVQ	g_m(AX), BX
   271  	MOVQ	m_curg(BX), AX
   272  	MOVQ	AX, g(CX)
   273  	MOVQ	(g_sched+gobuf_sp)(AX), SP	// back on the SP saved at switch
   274  	MOVQ	$0, (g_sched+gobuf_sp)(AX)	// the saved SP has been consumed; clear it
   275  	RET
   276  
   277  noswitch:
   278  	// already on m stack, just call directly
   279  	MOVQ	DI, DX
   280  	MOVQ	0(DI), DI
   281  	CALL	DI
   282  	RET
   283  
   284  /*
   285   * support for morestack
   286   */
   287  
   288  // Called during function prolog when more stack is needed.
   289  // Records f's caller in m->morebuf and f's context in g->sched, then calls newstack on m->g0.
   290  // The traceback routines see morestack on a g0 as being
   291  // the top of a stack (for example, morestack calling newstack
   292  // calling the scheduler calling newm calling gc), so we must
   293  // record an argument size. For that purpose, it has no arguments.
   294  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   295  	// Cannot grow scheduler stack (m->g0).
   296  	get_tls(CX)
   297  	MOVQ	g(CX), BX
   298  	MOVQ	g_m(BX), BX
   299  	MOVQ	m_g0(BX), SI
   300  	CMPQ	g(CX), SI
   301  	JNE	2(PC)	// not running on g0: skip the trap
   302  	INT	$3
   303  
   304  	// Cannot grow signal stack (m->gsignal).
   305  	MOVQ	m_gsignal(BX), SI
   306  	CMPQ	g(CX), SI
   307  	JNE	2(PC)	// not running on gsignal: skip the trap
   308  	INT	$3
   309  
   310  	// Called from f.
   311  	// Set m->morebuf to f's caller.
   312  	MOVQ	8(SP), AX	// f's caller's PC
   313  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   314  	LEAQ	16(SP), AX	// f's caller's SP
   315  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   316  	get_tls(CX)
   317  	MOVQ	g(CX), SI
   318  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   319  
   320  	// Set g->sched to context in f.
   321  	MOVQ	0(SP), AX // f's PC
   322  	MOVQ	AX, (g_sched+gobuf_pc)(SI)
   323  	MOVQ	SI, (g_sched+gobuf_g)(SI)
   324  	LEAQ	8(SP), AX // f's SP
   325  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   326  	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)	// DX on entry is saved as f's ctxt
   327  	MOVQ	BP, (g_sched+gobuf_bp)(SI)
   328  
   329  	// Call newstack on m->g0's stack.
   330  	MOVQ	m_g0(BX), BX
   331  	MOVQ	BX, g(CX)
   332  	MOVQ	(g_sched+gobuf_sp)(BX), SP
   333  	CALL	runtime·newstack(SB)
   334  	MOVQ	$0, 0x1003	// crash if newstack returns
   335  	RET
   336  
   337  // morestack but not preserving ctxt: DX is zeroed so morestack records a zero ctxt.
   338  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   339  	MOVL	$0, DX	// morestack stores DX into g->sched.ctxt
   340  	JMP	runtime·morestack(SB)
   341  
   342  TEXT runtime·stackBarrier(SB),NOSPLIT,$0
   343  	// We came here via a RET to an overwritten return PC, so SP is already one word past that slot.
   344  	// AX may be live. Other registers are available.
   345  
   346  	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   347  	get_tls(CX)
   348  	MOVQ	g(CX), CX
   349  	MOVQ	(g_stkbar+slice_array)(CX), DX
   350  	MOVQ	g_stkbarPos(CX), BX
   351  	IMULQ	$stkbar__size, BX	// Too big for SIB.
   352  	MOVQ	stkbar_savedLRPtr(DX)(BX*1), R8
   353  	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
   354  	ADDQ	$8, R8	// the RET that brought us here popped the slot, so expect savedLRPtr+8
   355  	CMPQ	R8, SP	// assert that we're popping the right saved LR
   356  	JEQ	2(PC)	// in sync: skip the crash
   357  	MOVL	$0, 0	// barrier record does not match this frame: fault
   358  	// Record that this stack barrier was hit.
   359  	ADDQ	$1, g_stkbarPos(CX)
   360  	// Jump to the original return PC.
   361  	JMP	BX
   362  
   363  // reflectcall: call a function with the given argument list
   364  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   365  // we don't have variable-sized frames, so we use a small number
   366  // of constant-sized-frame functions to encode a few bits of size in the pc.
   367  // DISPATCH(NAME,MAXSIZE): if CX <= MAXSIZE (unsigned) jump to NAME, else fall through to what follows.
   368  
   369  #define DISPATCH(NAME,MAXSIZE)		\
   370  	CMPQ	CX, $MAXSIZE;		\
   371  	JA	3(PC);			\
   372  	MOVQ	$NAME(SB), AX;		\
   373  	JMP	AX
   374  // Note: can't just "JMP NAME(SB)" - bad inlining results. JA 3(PC) steps over the MOVQ/JMP pair.
   375  
   376  TEXT reflect·call(SB), NOSPLIT, $0-0	// reflect-package entry point; forwards to ·reflectcall
   377  	JMP	·reflectcall(SB)
   378  
   379  TEXT ·reflectcall(SB), NOSPLIT, $0-32	// jump to the smallest callN whose frame holds argsize bytes
   380  	MOVLQZX argsize+24(FP), CX	// CX = argsize, zero-extended; each DISPATCH compares against it
   381  	// NOTE(rsc): No call16, because CALLFN needs four words
   382  	// of argument space to invoke callwritebarrier.
   383  	DISPATCH(runtime·call32, 32)
   384  	DISPATCH(runtime·call64, 64)
   385  	DISPATCH(runtime·call128, 128)
   386  	DISPATCH(runtime·call256, 256)
   387  	DISPATCH(runtime·call512, 512)
   388  	DISPATCH(runtime·call1024, 1024)
   389  	DISPATCH(runtime·call2048, 2048)
   390  	DISPATCH(runtime·call4096, 4096)
   391  	DISPATCH(runtime·call8192, 8192)
   392  	DISPATCH(runtime·call16384, 16384)
   393  	DISPATCH(runtime·call32768, 32768)
   394  	DISPATCH(runtime·call65536, 65536)
   395  	DISPATCH(runtime·call131072, 131072)
   396  	DISPATCH(runtime·call262144, 262144)
   397  	DISPATCH(runtime·call524288, 524288)
   398  	DISPATCH(runtime·call1048576, 1048576)
   399  	DISPATCH(runtime·call2097152, 2097152)
   400  	DISPATCH(runtime·call4194304, 4194304)
   401  	DISPATCH(runtime·call8388608, 8388608)
   402  	DISPATCH(runtime·call16777216, 16777216)
   403  	DISPATCH(runtime·call33554432, 33554432)
   404  	DISPATCH(runtime·call67108864, 67108864)
   405  	DISPATCH(runtime·call134217728, 134217728)
   406  	DISPATCH(runtime·call268435456, 268435456)
   407  	DISPATCH(runtime·call536870912, 536870912)
   408  	DISPATCH(runtime·call1073741824, 1073741824)
   409  	MOVQ	$runtime·badreflectcall(SB), AX	// argsize is larger than every frame size above
   410  	JMP	AX
   411  
   412  #define CALLFN(NAME,MAXSIZE)			\
   413  TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   414  	NO_LOCAL_POINTERS;			\
   415  	/* copy argsize bytes from argptr into the frame at SP */		\
   416  	MOVQ	argptr+16(FP), SI;		\
   417  	MOVLQZX argsize+24(FP), CX;		\
   418  	MOVQ	SP, DI;				\
   419  	REP;MOVSB;				\
   420  	/* call the code pointer stored at f, with f itself in DX */			\
   421  	MOVQ	f+8(FP), DX;			\
   422  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   423  	CALL	(DX);				\
   424  	/* copy bytes [retoffset, argsize) of the frame back to argptr */		\
   425  	MOVQ	argptr+16(FP), DI;		\
   426  	MOVLQZX	argsize+24(FP), CX;		\
   427  	MOVLQZX retoffset+28(FP), BX;		\
   428  	MOVQ	SP, SI;				\
   429  	ADDQ	BX, DI;				\
   430  	ADDQ	BX, SI;				\
   431  	SUBQ	BX, CX;				\
   432  	REP;MOVSB;				\
   433  	/* callwritebarrier(argtype, argptr, argsize, retoffset) */	\
   434  	MOVQ	argtype+0(FP), DX;		\
   435  	MOVQ	argptr+16(FP), DI;		\
   436  	MOVLQZX	argsize+24(FP), CX;		\
   437  	MOVLQZX retoffset+28(FP), BX;		\
   438  	MOVQ	DX, 0(SP);			\
   439  	MOVQ	DI, 8(SP);			\
   440  	MOVQ	CX, 16(SP);			\
   441  	MOVQ	BX, 24(SP);			\
   442  	CALL	runtime·callwritebarrier(SB);	\
   443  	RET
   444  
   445  CALLFN(·call32, 32)
   446  CALLFN(·call64, 64)
   447  CALLFN(·call128, 128)
   448  CALLFN(·call256, 256)
   449  CALLFN(·call512, 512)
   450  CALLFN(·call1024, 1024)
   451  CALLFN(·call2048, 2048)
   452  CALLFN(·call4096, 4096)
   453  CALLFN(·call8192, 8192)
   454  CALLFN(·call16384, 16384)
   455  CALLFN(·call32768, 32768)
   456  CALLFN(·call65536, 65536)
   457  CALLFN(·call131072, 131072)
   458  CALLFN(·call262144, 262144)
   459  CALLFN(·call524288, 524288)
   460  CALLFN(·call1048576, 1048576)
   461  CALLFN(·call2097152, 2097152)
   462  CALLFN(·call4194304, 4194304)
   463  CALLFN(·call8388608, 8388608)
   464  CALLFN(·call16777216, 16777216)
   465  CALLFN(·call33554432, 33554432)
   466  CALLFN(·call67108864, 67108864)
   467  CALLFN(·call134217728, 134217728)
   468  CALLFN(·call268435456, 268435456)
   469  CALLFN(·call536870912, 536870912)
   470  CALLFN(·call1073741824, 1073741824)
   471  
   472  // bool cas(int32 *val, int32 old, int32 new)
   473  // 32-bit compare-and-swap. Atomically:
   474  //	if(*val == old){
   475  //		*val = new;
   476  //		return 1;
   477  //	} else
   478  //		return 0;
   479  TEXT runtime·cas(SB), NOSPLIT, $0-17
   480  	MOVL	new+12(FP), CX	// CX = replacement value
   481  	MOVL	old+8(FP), AX	// AX = expected value, the implicit CMPXCHG comparand
   482  	MOVQ	ptr+0(FP), BX	// BX = address of the word
   483  	LOCK
   484  	CMPXCHGL	CX, (BX)
   485  	SETEQ	ret+16(FP)	// 1 if the exchange happened, else 0
   486  	RET
   487  
   488  // bool	runtime·cas64(uint64 *val, uint64 old, uint64 new)
   489  // 64-bit compare-and-swap. Atomically:
   490  //	if(*val == old){
   491  //		*val = new;
   492  //		return 1;
   493  //	} else {
   494  //		return 0;
   495  //	}
   496  TEXT runtime·cas64(SB), NOSPLIT, $0-25
   497  	MOVQ	new+16(FP), CX	// CX = replacement value
   498  	MOVQ	old+8(FP), AX	// AX = expected value, the implicit CMPXCHG comparand
   499  	MOVQ	ptr+0(FP), BX	// BX = address of the quadword
   500  	LOCK
   501  	CMPXCHGQ	CX, (BX)
   502  	SETEQ	ret+24(FP)	// 1 if the exchange happened, else 0
   503  	RET
   504  	
   505  TEXT runtime·casuintptr(SB), NOSPLIT, $0-25	// same 25-byte argument frame as cas64
   506  	JMP	runtime·cas64(SB)	// tail-jump: cas64 reads and writes this caller's frame
   507  
   508  TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-16	// forwards to the 64-bit load
   509  	JMP	runtime·atomicload64(SB)	// tail-jump; atomicload64 is defined outside this listing
   510  
   511  TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-16	// forwards to the 64-bit load
   512  	JMP	runtime·atomicload64(SB)	// tail-jump; atomicload64 is defined outside this listing
   513  
   514  TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16	// same 16-byte argument frame as atomicstore64
   515  	JMP	runtime·atomicstore64(SB)	// tail-jump: atomicstore64 reads this caller's frame
   516  
   517  // bool casp(void **val, void *old, void *new)
   518  // Pointer-sized compare-and-swap. Atomically:
   519  //	if(*val == old){
   520  //		*val = new;
   521  //		return 1;
   522  //	} else
   523  //		return 0;
   524  TEXT runtime·casp1(SB), NOSPLIT, $0-25
   525  	MOVQ	new+16(FP), CX	// CX = replacement pointer
   526  	MOVQ	old+8(FP), AX	// AX = expected pointer, the implicit CMPXCHG comparand
   527  	MOVQ	ptr+0(FP), BX	// BX = address of the pointer slot
   528  	LOCK
   529  	CMPXCHGQ	CX, (BX)
   530  	SETEQ	ret+24(FP)	// 1 if the exchange happened, else 0
   531  	RET
   532  
   533  // uint32 xadd(uint32 volatile *val, int32 delta)
   534  // Atomic 32-bit add that returns the updated value:
   535  //	*val += delta;
   536  //	return *val;
   537  TEXT runtime·xadd(SB), NOSPLIT, $0-20
   538  	MOVL	delta+8(FP), CX	// CX keeps delta for the final sum
   539  	MOVQ	ptr+0(FP), BX
   540  	MOVL	CX, AX	// AX = delta, the XADD source operand
   541  	LOCK
   542  	XADDL	AX, (BX)	// *val += delta; AX receives the previous *val
   543  	ADDL	CX, AX	// previous + delta = value now stored
   544  	MOVL	AX, ret+16(FP)
   545  	RET
   546  
   547  TEXT runtime·xadd64(SB), NOSPLIT, $0-24	// atomic 64-bit add; returns the updated value
   548  	MOVQ	delta+8(FP), CX	// CX keeps delta for the final sum
   549  	MOVQ	ptr+0(FP), BX
   550  	MOVQ	CX, AX	// AX = delta, the XADD source operand
   551  	LOCK
   552  	XADDQ	AX, (BX)	// *ptr += delta; AX receives the previous *ptr
   553  	ADDQ	CX, AX	// previous + delta = value now stored
   554  	MOVQ	AX, ret+16(FP)
   555  	RET
   556  
   557  TEXT runtime·xadduintptr(SB), NOSPLIT, $0-24	// same 24-byte argument frame as xadd64
   558  	JMP	runtime·xadd64(SB)	// tail-jump: xadd64 reads and writes this caller's frame
   559  
   560  TEXT runtime·xchg(SB), NOSPLIT, $0-20	// 32-bit exchange: store new at *ptr, return the old value
   561  	MOVL	new+8(FP), AX	// AX = value to store
   562  	MOVQ	ptr+0(FP), BX	// BX = address of the word
   563  	XCHGL	AX, (BX)	// swap; AX now holds the previous contents
   564  	MOVL	AX, ret+16(FP)
   565  	RET
   566  
   567  TEXT runtime·xchg64(SB), NOSPLIT, $0-24	// 64-bit exchange: store new at *ptr, return the old value
   568  	MOVQ	new+8(FP), AX	// AX = value to store
   569  	MOVQ	ptr+0(FP), BX	// BX = address of the quadword
   570  	XCHGQ	AX, (BX)	// swap; AX now holds the previous contents
   571  	MOVQ	AX, ret+16(FP)
   572  	RET
   573  
   574  TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24	// same 24-byte argument frame as xchg64
   575  	JMP	runtime·xchg64(SB)	// tail-jump: xchg64 reads and writes this caller's frame
   576  
   577  TEXT runtime·procyield(SB),NOSPLIT,$0-0	// NOTE(review): frame is declared $0-0 although cycles is read from 0(FP) — confirm
   578  	MOVL	cycles+0(FP), AX	// AX = number of PAUSE iterations
   579  again:
   580  	PAUSE
   581  	SUBL	$1, AX
   582  	JNZ	again	// repeat until the counter reaches zero
   583  	RET
   584  
   585  TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16	// store the pointer-sized val at *ptr using XCHG
   586  	MOVQ	val+8(FP), AX	// AX = value to store
   587  	MOVQ	ptr+0(FP), BX	// BX = destination address
   588  	XCHGQ	AX, (BX)	// the previous contents land in AX and are discarded
   589  	RET
   590  
   591  TEXT runtime·atomicstore(SB), NOSPLIT, $0-12	// store the 32-bit val at *ptr using XCHG
   592  	MOVL	val+8(FP), AX	// AX = value to store
   593  	MOVQ	ptr+0(FP), BX	// BX = destination address
   594  	XCHGL	AX, (BX)	// the previous contents land in AX and are discarded
   595  	RET
   596  
   597  TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16	// store the 64-bit val at *ptr using XCHG
   598  	MOVQ	val+8(FP), AX	// AX = value to store
   599  	MOVQ	ptr+0(FP), BX	// BX = destination address
   600  	XCHGQ	AX, (BX)	// the previous contents land in AX and are discarded
   601  	RET
   602  
   603  // void	runtime·atomicor8(byte volatile*, byte); atomically performs *ptr |= val on one byte.
   604  TEXT runtime·atomicor8(SB), NOSPLIT, $0-9
   605  	MOVB	val+8(FP), BX	// BX = mask byte
   606  	MOVQ	ptr+0(FP), AX	// AX = address of the target byte
   607  	LOCK
   608  	ORB	BX, 0(AX)
   609  	RET
   610  
   611  // void	runtime·atomicand8(byte volatile*, byte); atomically performs *ptr &= val on one byte.
   612  TEXT runtime·atomicand8(SB), NOSPLIT, $0-9
   613  	MOVB	val+8(FP), BX	// BX = mask byte
   614  	MOVQ	ptr+0(FP), AX	// AX = address of the target byte
   615  	LOCK
   616  	ANDB	BX, 0(AX)
   617  	RET
   618  
   619  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0	// emits no instructions other than RET
   620  	// Stores are already ordered on x86, so this is just a
   621  	// compile barrier.
   622  	RET
   623  
   624  // void jmpdefer(fn, sp);
   625  // called from deferreturn.
   626  // 1. pop the caller
   627  // 2. sub 5 bytes from the caller's return PC
   628  // 3. jmp to the argument
   629  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   630  	MOVQ	fv+0(FP), DX	// fn
   631  	MOVQ	argp+8(FP), BX	// caller sp
   632  	LEAQ	-8(BX), SP	// caller sp after CALL
   633  	SUBQ	$5, (SP)	// return to CALL again
   634  	MOVQ	0(DX), BX	// BX = code pointer stored in the first word of fn
   635  	JMP	BX	// but first run the deferred function
   636  
   637  // Save state of caller (PC, SP, BP; ret and ctxt zeroed) into g->sched. Smashes R8, R9.
   638  TEXT gosave<>(SB),NOSPLIT,$0
   639  	get_tls(R8)
   640  	MOVQ	g(R8), R8	// R8 = current g
   641  	MOVQ	0(SP), R9	// return address = caller's PC
   642  	MOVQ	R9, (g_sched+gobuf_pc)(R8)
   643  	LEAQ	8(SP), R9	// caller's SP: just above the return address
   644  	MOVQ	R9, (g_sched+gobuf_sp)(R8)
   645  	MOVQ	$0, (g_sched+gobuf_ret)(R8)
   646  	MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
   647  	MOVQ	BP, (g_sched+gobuf_bp)(R8)
   648  	RET
   649  
   650  // func asmcgocall(fn, arg unsafe.Pointer) int32
   651  // Call fn(arg) on the scheduler stack,
   652  // aligned appropriately for the gcc ABI.
   653  // See cgocall.go for more details.
   654  TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   655  	MOVQ	fn+0(FP), AX
   656  	MOVQ	arg+8(FP), BX
   657  
   658  	MOVQ	SP, DX	// DX = SP on entry; used below to compute the stack depth
   659  
   660  	// Figure out if we need to switch to m->g0 stack.
   661  	// We get called to create new OS threads too, and those
   662  	// come in on the m->g0 stack already.
   663  	get_tls(CX)
   664  	MOVQ	g(CX), R8
   665  	MOVQ	g_m(R8), R8
   666  	MOVQ	m_g0(R8), SI
   667  	MOVQ	g(CX), DI	// DI = g on entry; saved at 48(SP) below
   668  	CMPQ	SI, DI
   669  	JEQ	nosave
   670  	MOVQ	m_gsignal(R8), SI
   671  	CMPQ	SI, DI
   672  	JEQ	nosave
   673  	
   674  	MOVQ	m_g0(R8), SI
   675  	CALL	gosave<>(SB)
   676  	MOVQ	SI, g(CX)
   677  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   678  nosave:
   679  
   680  	// Now on a scheduling stack (a pthread-created stack).
   681  	// Make sure we have enough room for 4 stack-backed fast-call
   682  	// registers as per windows amd64 calling convention.
   683  	SUBQ	$64, SP
   684  	ANDQ	$~15, SP	// alignment for gcc ABI
   685  	MOVQ	DI, 48(SP)	// save g
   686  	MOVQ	(g_stack+stack_hi)(DI), DI
   687  	SUBQ	DX, DI	// depth = g->stack.hi - entry SP
   688  	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   689  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   690  	MOVQ	BX, CX		// CX = first argument in Win64
   691  	CALL	AX
   692  
   693  	// Restore registers, g, stack pointer.
   694  	get_tls(CX)
   695  	MOVQ	48(SP), DI
   696  	MOVQ	(g_stack+stack_hi)(DI), SI
   697  	SUBQ	40(SP), SI	// SP = g->stack.hi - saved depth
   698  	MOVQ	DI, g(CX)
   699  	MOVQ	SI, SP
   700  
   701  	MOVL	AX, ret+16(FP)	// fn's 32-bit result, left in AX by the call
   702  	RET
   703  
   704  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
   705  // Turn the fn into a Go func (by taking its address) and call
   706  // cgocallback_gofunc.
   707  TEXT runtime·cgocallback(SB),NOSPLIT,$24-24
   708  	LEAQ	fn+0(FP), AX	// address of the fn slot, passed as the func value pointer
   709  	MOVQ	AX, 0(SP)
   710  	MOVQ	frame+8(FP), AX
   711  	MOVQ	AX, 8(SP)
   712  	MOVQ	framesize+16(FP), AX
   713  	MOVQ	AX, 16(SP)
   714  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   715  	CALL	AX
   716  	RET
   717  
   718  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
   719  // See cgocall.go for more details.
   720  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-24
   721  	NO_LOCAL_POINTERS
   722  
   723  	// If g is nil, Go did not create the current thread.
   724  	// Call needm to obtain one m for temporary use.
   725  	// In this case, we're running on the thread stack, so there's
   726  	// lots of space, but the linker doesn't know. Hide the call from
   727  	// the linker analysis by using an indirect call through AX.
   728  	get_tls(CX)
   729  #ifdef GOOS_windows
   730  	MOVL	$0, BX
   731  	CMPQ	CX, $0
   732  	JEQ	2(PC)
   733  #endif
   734  	MOVQ	g(CX), BX
   735  	CMPQ	BX, $0
   736  	JEQ	needm
   737  	MOVQ	g_m(BX), BX
   738  	MOVQ	BX, R8 // holds oldm until end of function
   739  	JMP	havem
   740  needm:
   741  	MOVQ	$0, 0(SP)
   742  	MOVQ	$runtime·needm(SB), AX
   743  	CALL	AX
   744  	MOVQ	0(SP), R8	// reload the word zeroed before the call as the old-m marker
   745  	get_tls(CX)
   746  	MOVQ	g(CX), BX
   747  	MOVQ	g_m(BX), BX
   748  	
   749  	// Set m->sched.sp = SP, so that if a panic happens
   750  	// during the function we are about to execute, it will
   751  	// have a valid SP to run on the g0 stack.
   752  	// The next few lines (after the havem label)
   753  	// will save this SP onto the stack and then write
   754  	// the same SP back to m->sched.sp. That seems redundant,
   755  	// but if an unrecovered panic happens, unwindm will
   756  	// restore the g->sched.sp from the stack location
   757  	// and then systemstack will try to use it. If we don't set it here,
   758  	// that restored SP will be uninitialized (typically 0) and
   759  	// will not be usable.
   760  	MOVQ	m_g0(BX), SI
   761  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   762  
   763  havem:
   764  	// Now there's a valid m, and we're running on its m->g0.
   765  	// Save current m->g0->sched.sp on stack and then set it to SP.
   766  	// Save current sp in m->g0->sched.sp in preparation for
   767  	// switch back to m->curg stack.
   768  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   769  	MOVQ	m_g0(BX), SI
   770  	MOVQ	(g_sched+gobuf_sp)(SI), AX
   771  	MOVQ	AX, 0(SP)
   772  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   773  
   774  	// Switch to m->curg stack and call runtime.cgocallbackg.
   775  	// Because we are taking over the execution of m->curg
   776  	// but *not* resuming what had been running, we need to
   777  	// save that information (m->curg->sched) so we can restore it.
   778  	// We can restore m->curg->sched.sp easily, because calling
   779  	// runtime.cgocallbackg leaves SP unchanged upon return.
   780  	// To save m->curg->sched.pc, we push it onto the stack.
   781  	// This has the added benefit that it looks to the traceback
   782  	// routine like cgocallbackg is going to return to that
   783  	// PC (because the frame we allocate below has the same
   784  	// size as cgocallback_gofunc's frame declared above)
   785  	// so that the traceback will seamlessly trace back into
   786  	// the earlier calls.
   787  	//
   788  	// In the new goroutine, 0(SP) holds the saved R8.
   789  	MOVQ	m_curg(BX), SI
   790  	MOVQ	SI, g(CX)
   791  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   792  	MOVQ	(g_sched+gobuf_pc)(SI), BX
   793  	MOVQ	BX, -8(DI)
   794  	// Compute the size of the frame, including return PC and, if
   795  	// GOEXPERIMENT=framepointer, the saved base pointer
   796  	LEAQ	fv+0(FP), AX
   797  	SUBQ	SP, AX
   798  	SUBQ	AX, DI
   799  	MOVQ	DI, SP
   800  
   801  	MOVQ	R8, 0(SP)
   802  	CALL	runtime·cgocallbackg(SB)
   803  	MOVQ	0(SP), R8
   804  
   805  	// Compute the size of the frame again.  FP and SP have
   806  	// completely different values here than they did above,
   807  	// but only their difference matters.
   808  	LEAQ	fv+0(FP), AX
   809  	SUBQ	SP, AX
   810  
   811  	// Restore g->sched (== m->curg->sched) from saved values.
   812  	get_tls(CX)
   813  	MOVQ	g(CX), SI
   814  	MOVQ	SP, DI
   815  	ADDQ	AX, DI
   816  	MOVQ	-8(DI), BX
   817  	MOVQ	BX, (g_sched+gobuf_pc)(SI)
   818  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   819  
   820  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   821  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   822  	// so we do not have to restore it.)
   823  	MOVQ	g(CX), BX
   824  	MOVQ	g_m(BX), BX
   825  	MOVQ	m_g0(BX), SI
   826  	MOVQ	SI, g(CX)
   827  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   828  	MOVQ	0(SP), AX
   829  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   830  	
   831  	// If the m on entry was nil, we called needm above to borrow an m
   832  	// for the duration of the call. Since the call is over, return it with dropm.
   833  	CMPQ	R8, $0
   834  	JNE 3(PC)	// an m existed on entry: step over the two-instruction dropm call
   835  	MOVQ	$runtime·dropm(SB), AX
   836  	CALL	AX
   837  
   838  	// Done!
   839  	RET
   840  
   841  // void setg(G*); set g. for use by needm. On Windows it also updates the slot at 0x28(GS).
   842  TEXT runtime·setg(SB), NOSPLIT, $0-8
   843  	MOVQ	gg+0(FP), BX
   844  #ifdef GOOS_windows
   845  	CMPQ	BX, $0
   846  	JNE	settls
   847  	MOVQ	$0, 0x28(GS)	// nil g: clear the slot and return without touching g(CX)
   848  	RET
   849  settls:
   850  	MOVQ	g_m(BX), AX
   851  	LEAQ	m_tls(AX), AX	// AX = address of g->m->tls
   852  	MOVQ	AX, 0x28(GS)
   853  #endif
   854  	get_tls(CX)
   855  	MOVQ	BX, g(CX)
   856  	RET
   857  
   858  // void setg_gcc(G*); set g called from gcc. The new g arrives in DI.
   859  TEXT setg_gcc<>(SB),NOSPLIT,$0
   860  	get_tls(AX)
   861  	MOVQ	DI, g(AX)
   862  	RET
   863  
   864  // check that g->stack.lo < SP < g->stack.hi (both comparisons unsigned and strict); trap with INT 3 otherwise
   865  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   866  	get_tls(CX)
   867  	MOVQ	g(CX), AX
   868  	CMPQ	(g_stack+stack_hi)(AX), SP
   869  	JHI	2(PC)	// stack.hi > SP: skip the trap
   870  	INT	$3
   871  	CMPQ	SP, (g_stack+stack_lo)(AX)
   872  	JHI	2(PC)	// SP > stack.lo: skip the trap
   873  	INT	$3
   874  	RET
   875  
   876  TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16	// return the PC stored just below argp, looking through a stack barrier
   877  	MOVQ	argp+0(FP),AX		// addr of first arg
   878  	MOVQ	-8(AX),AX		// get calling pc
   879  	CMPQ	AX, runtime·stackBarrierPC(SB)
   880  	JNE	nobar
   881  	// Get original return PC.
   882  	CALL	runtime·nextBarrierPC(SB)
   883  	MOVQ	0(SP), AX	// nextBarrierPC's result is read from 0(SP)
   884  nobar:
   885  	MOVQ	AX, ret+8(FP)
   886  	RET
   887  
   888  TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16	// overwrite the PC stored just below argp, or the barrier's saved PC
   889  	MOVQ	argp+0(FP),AX		// addr of first arg
   890  	MOVQ	pc+8(FP), BX
   891  	MOVQ	-8(AX), CX	// CX = PC currently in the slot
   892  	CMPQ	CX, runtime·stackBarrierPC(SB)
   893  	JEQ	setbar
   894  	MOVQ	BX, -8(AX)		// set calling pc
   895  	RET
   896  setbar:
   897  	// Set the stack barrier return PC.
   898  	MOVQ	BX, 0(SP)	// argument for setNextBarrierPC
   899  	CALL	runtime·setNextBarrierPC(SB)
   900  	RET
   901  
   902  TEXT runtime·getcallersp(SB),NOSPLIT,$0-16	// returns its argp argument unchanged
   903  	MOVQ	argp+0(FP), AX
   904  	MOVQ	AX, ret+8(FP)
   905  	RET
   906  
   907  // func cputicks() int64: issue LFENCE or MFENCE (per lfenceBeforeRdtsc), then return the 64-bit RDTSC value
   908  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   909  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   910  	JNE	mfence
   911  	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
   912  	JMP	done
   913  mfence:
   914  	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
   915  done:
   916  	RDTSC
   917  	SHLQ	$32, DX	// DX holds the high half of the counter
   918  	ADDQ	DX, AX	// AX = high<<32 + low
   919  	MOVQ	AX, ret+0(FP)
   920  	RET
   921  
   922  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   923  // redirects to memhash(p, h, size) using the size
   924  // stored in the closure (read from offset 8 of DX).
   925  TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
   926  	GO_ARGS
   927  	NO_LOCAL_POINTERS
   928  	MOVQ	p+0(FP), AX
   929  	MOVQ	h+8(FP), BX
   930  	MOVQ	8(DX), CX	// CX = size taken from the closure
   931  	MOVQ	AX, 0(SP)
   932  	MOVQ	BX, 8(SP)
   933  	MOVQ	CX, 16(SP)
   934  	CALL	runtime·memhash(SB)
   935  	MOVQ	24(SP), AX	// memhash's result sits after its three argument words
   936  	MOVQ	AX, ret+16(FP)
   937  	RET
   938  
   939  // hash function using AES hardware instructions; loads the registers aeshashbody expects and jumps to it
   940  TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   941  	MOVQ	p+0(FP), AX	// ptr to data
   942  	MOVQ	s+16(FP), CX	// size
   943  	LEAQ	ret+24(FP), DX	// address of the result slot
   944  	JMP	runtime·aeshashbody(SB)
   945  
   946  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24	// string variant: unpacks the data pointer and length, then jumps to aeshashbody
   947  	MOVQ	p+0(FP), AX	// ptr to string struct
   948  	MOVQ	8(AX), CX	// length of string
   949  	MOVQ	(AX), AX	// string data
   950  	LEAQ	ret+16(FP), DX	// address of the result slot
   951  	JMP	runtime·aeshashbody(SB)
   952  
   953  // AX: data
   954  // CX: length
   955  // DX: address to put return value
        // The 64-bit seed is not passed in a register: it is read from
        // h+8(FP), i.e. from the argument frame of the stub that jumped here.
        // The code dispatches on the length (0, 1-15, 16, 17-32, 33-64,
        // 65-128, 129+) and always stores the low 64 bits of the final
        // AES state to (DX).
   956  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   957  	// Fill an SSE register with our seeds.
   958  	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
   959  	PINSRW	$4, CX, X0			// 16 bits of length
   960  	PSHUFHW $0, X0, X0			// repeat length 4 times total
   961  	MOVO	X0, X1				// save unscrambled seed
   962  	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
   963  	AESENC	X0, X0				// scramble seed
   964  
        	// Dispatch on length.  The flags of the first CMPQ serve both
        	// the JB (length < 16) and the JE (length == 16).
   965  	CMPQ	CX, $16
   966  	JB	aes0to15
   967  	JE	aes16
   968  	CMPQ	CX, $32
   969  	JBE	aes17to32
   970  	CMPQ	CX, $64
   971  	JBE	aes33to64
   972  	CMPQ	CX, $128
   973  	JBE	aes65to128
   974  	JMP	aes129plus
   975  
   976  aes0to15:
   977  	TESTQ	CX, CX
   978  	JE	aes0
   979  
        	// AX = data+16.  If bits 4-11 of that address are all zero,
        	// a 16-byte load starting at data might run into the next
        	// page, so take the endofpage path instead.
   980  	ADDQ	$16, AX
   981  	TESTW	$0xff0, AX
   982  	JE	endofpage
   983  
   984  	// 16 bytes loaded at this address won't cross
   985  	// a page boundary, so we can load it directly.
   986  	MOVOU	-16(AX), X1
        	// CX*2 scaled by 8 below = 16*length: the 16-byte masks<>
        	// entry that keeps only the low `length` bytes.
   987  	ADDQ	CX, CX
   988  	MOVQ	$masks<>(SB), AX
   989  	PAND	(AX)(CX*8), X1
   990  final1:
   991  	AESENC	X0, X1	// scramble input, xor in seed
   992  	AESENC	X1, X1  // scramble combo 2 times
   993  	AESENC	X1, X1
   994  	MOVQ	X1, (DX)
   995  	RET
   996  
   997  endofpage:
   998  	// address ends in 1111xxxx.  Might be up against
   999  	// a page boundary, so load ending at last byte.
  1000  	// Then shift bytes down using pshufb.
        	// (AX is data+16 here, so -32(AX)(CX*1) = data+length-16.)
  1001  	MOVOU	-32(AX)(CX*1), X1
  1002  	ADDQ	CX, CX
  1003  	MOVQ	$shifts<>(SB), AX
  1004  	PSHUFB	(AX)(CX*8), X1
  1005  	JMP	final1
  1006  
  1007  aes0:
  1008  	// Return scrambled input seed
  1009  	AESENC	X0, X0
  1010  	MOVQ	X0, (DX)
  1011  	RET
  1012  
  1013  aes16:
        	// exactly 16 bytes: one full unaligned load, no masking needed.
  1014  	MOVOU	(AX), X1
  1015  	JMP	final1
  1016  
  1017  aes17to32:
  1018  	// make second starting seed
  1019  	PXOR	runtime·aeskeysched+16(SB), X1
  1020  	AESENC	X1, X1
  1021  	
  1022  	// load data to be hashed
        	// (first 16 bytes and last 16 bytes; the two loads overlap
        	// when length < 32)
  1023  	MOVOU	(AX), X2
  1024  	MOVOU	-16(AX)(CX*1), X3
  1025  
  1026  	// scramble 3 times
  1027  	AESENC	X0, X2
  1028  	AESENC	X1, X3
  1029  	AESENC	X2, X2
  1030  	AESENC	X3, X3
  1031  	AESENC	X2, X2
  1032  	AESENC	X3, X3
  1033  
  1034  	// combine results
  1035  	PXOR	X3, X2
  1036  	MOVQ	X2, (DX)
  1037  	RET
  1038  
  1039  aes33to64:
  1040  	// make 3 more starting seeds
  1041  	MOVO	X1, X2
  1042  	MOVO	X1, X3
  1043  	PXOR	runtime·aeskeysched+16(SB), X1
  1044  	PXOR	runtime·aeskeysched+32(SB), X2
  1045  	PXOR	runtime·aeskeysched+48(SB), X3
  1046  	AESENC	X1, X1
  1047  	AESENC	X2, X2
  1048  	AESENC	X3, X3
  1049  	
        	// load first 32 and last 32 bytes (overlapping when length < 64)
  1050  	MOVOU	(AX), X4
  1051  	MOVOU	16(AX), X5
  1052  	MOVOU	-32(AX)(CX*1), X6
  1053  	MOVOU	-16(AX)(CX*1), X7
  1054  	
        	// scramble data, xor in seeds
  1055  	AESENC	X0, X4
  1056  	AESENC	X1, X5
  1057  	AESENC	X2, X6
  1058  	AESENC	X3, X7
  1059  	
        	// scramble twice more
  1060  	AESENC	X4, X4
  1061  	AESENC	X5, X5
  1062  	AESENC	X6, X6
  1063  	AESENC	X7, X7
  1064  	
  1065  	AESENC	X4, X4
  1066  	AESENC	X5, X5
  1067  	AESENC	X6, X6
  1068  	AESENC	X7, X7
  1069  
        	// combine the four lanes into X4
  1070  	PXOR	X6, X4
  1071  	PXOR	X7, X5
  1072  	PXOR	X5, X4
  1073  	MOVQ	X4, (DX)
  1074  	RET
  1075  
  1076  aes65to128:
  1077  	// make 7 more starting seeds
  1078  	MOVO	X1, X2
  1079  	MOVO	X1, X3
  1080  	MOVO	X1, X4
  1081  	MOVO	X1, X5
  1082  	MOVO	X1, X6
  1083  	MOVO	X1, X7
  1084  	PXOR	runtime·aeskeysched+16(SB), X1
  1085  	PXOR	runtime·aeskeysched+32(SB), X2
  1086  	PXOR	runtime·aeskeysched+48(SB), X3
  1087  	PXOR	runtime·aeskeysched+64(SB), X4
  1088  	PXOR	runtime·aeskeysched+80(SB), X5
  1089  	PXOR	runtime·aeskeysched+96(SB), X6
  1090  	PXOR	runtime·aeskeysched+112(SB), X7
  1091  	AESENC	X1, X1
  1092  	AESENC	X2, X2
  1093  	AESENC	X3, X3
  1094  	AESENC	X4, X4
  1095  	AESENC	X5, X5
  1096  	AESENC	X6, X6
  1097  	AESENC	X7, X7
  1098  
  1099  	// load data
        	// (first 64 and last 64 bytes; overlapping when length < 128)
  1100  	MOVOU	(AX), X8
  1101  	MOVOU	16(AX), X9
  1102  	MOVOU	32(AX), X10
  1103  	MOVOU	48(AX), X11
  1104  	MOVOU	-64(AX)(CX*1), X12
  1105  	MOVOU	-48(AX)(CX*1), X13
  1106  	MOVOU	-32(AX)(CX*1), X14
  1107  	MOVOU	-16(AX)(CX*1), X15
  1108  
  1109  	// scramble data, xor in seed
  1110  	AESENC	X0, X8
  1111  	AESENC	X1, X9
  1112  	AESENC	X2, X10
  1113  	AESENC	X3, X11
  1114  	AESENC	X4, X12
  1115  	AESENC	X5, X13
  1116  	AESENC	X6, X14
  1117  	AESENC	X7, X15
  1118  
  1119  	// scramble twice
  1120  	AESENC	X8, X8
  1121  	AESENC	X9, X9
  1122  	AESENC	X10, X10
  1123  	AESENC	X11, X11
  1124  	AESENC	X12, X12
  1125  	AESENC	X13, X13
  1126  	AESENC	X14, X14
  1127  	AESENC	X15, X15
  1128  	
  1129  	AESENC	X8, X8
  1130  	AESENC	X9, X9
  1131  	AESENC	X10, X10
  1132  	AESENC	X11, X11
  1133  	AESENC	X12, X12
  1134  	AESENC	X13, X13
  1135  	AESENC	X14, X14
  1136  	AESENC	X15, X15
  1137  
  1138  	// combine results
  1139  	PXOR	X12, X8
  1140  	PXOR	X13, X9
  1141  	PXOR	X14, X10
  1142  	PXOR	X15, X11
  1143  	PXOR	X10, X8
  1144  	PXOR	X11, X9
  1145  	PXOR	X9, X8
  1146  	MOVQ	X8, (DX)
  1147  	RET
  1148  
  1149  aes129plus:
  1150  	// make 7 more starting seeds
  1151  	MOVO	X1, X2
  1152  	MOVO	X1, X3
  1153  	MOVO	X1, X4
  1154  	MOVO	X1, X5
  1155  	MOVO	X1, X6
  1156  	MOVO	X1, X7
  1157  	PXOR	runtime·aeskeysched+16(SB), X1
  1158  	PXOR	runtime·aeskeysched+32(SB), X2
  1159  	PXOR	runtime·aeskeysched+48(SB), X3
  1160  	PXOR	runtime·aeskeysched+64(SB), X4
  1161  	PXOR	runtime·aeskeysched+80(SB), X5
  1162  	PXOR	runtime·aeskeysched+96(SB), X6
  1163  	PXOR	runtime·aeskeysched+112(SB), X7
  1164  	AESENC	X1, X1
  1165  	AESENC	X2, X2
  1166  	AESENC	X3, X3
  1167  	AESENC	X4, X4
  1168  	AESENC	X5, X5
  1169  	AESENC	X6, X6
  1170  	AESENC	X7, X7
  1171  	
  1172  	// start with last (possibly overlapping) block
  1173  	MOVOU	-128(AX)(CX*1), X8
  1174  	MOVOU	-112(AX)(CX*1), X9
  1175  	MOVOU	-96(AX)(CX*1), X10
  1176  	MOVOU	-80(AX)(CX*1), X11
  1177  	MOVOU	-64(AX)(CX*1), X12
  1178  	MOVOU	-48(AX)(CX*1), X13
  1179  	MOVOU	-32(AX)(CX*1), X14
  1180  	MOVOU	-16(AX)(CX*1), X15
  1181  
  1182  	// scramble input once, xor in seed
  1183  	AESENC	X0, X8
  1184  	AESENC	X1, X9
  1185  	AESENC	X2, X10
  1186  	AESENC	X3, X11
  1187  	AESENC	X4, X12
  1188  	AESENC	X5, X13
  1189  	AESENC	X6, X14
  1190  	AESENC	X7, X15
  1191  	
  1192  	// compute number of remaining 128-byte blocks
        	// CX = (length-1) >> 7, which is >= 1 because length >= 129.
  1193  	DECQ	CX
  1194  	SHRQ	$7, CX
  1195  	
  1196  aesloop:
  1197  	// scramble state, xor in a block
        	// (the seeds in X0-X7 were consumed above, so those registers
        	// are reused here to hold the next 128 bytes of data)
  1198  	MOVOU	(AX), X0
  1199  	MOVOU	16(AX), X1
  1200  	MOVOU	32(AX), X2
  1201  	MOVOU	48(AX), X3
  1202  	AESENC	X0, X8
  1203  	AESENC	X1, X9
  1204  	AESENC	X2, X10
  1205  	AESENC	X3, X11
  1206  	MOVOU	64(AX), X4
  1207  	MOVOU	80(AX), X5
  1208  	MOVOU	96(AX), X6
  1209  	MOVOU	112(AX), X7
  1210  	AESENC	X4, X12
  1211  	AESENC	X5, X13
  1212  	AESENC	X6, X14
  1213  	AESENC	X7, X15
  1214  
  1215  	// scramble state
  1216  	AESENC	X8, X8
  1217  	AESENC	X9, X9
  1218  	AESENC	X10, X10
  1219  	AESENC	X11, X11
  1220  	AESENC	X12, X12
  1221  	AESENC	X13, X13
  1222  	AESENC	X14, X14
  1223  	AESENC	X15, X15
  1224  
        	// advance to the next block; DECQ sets ZF when the count hits 0
  1225  	ADDQ	$128, AX
  1226  	DECQ	CX
  1227  	JNE	aesloop
  1228  
  1229  	// 2 more scrambles to finish
  1230  	AESENC	X8, X8
  1231  	AESENC	X9, X9
  1232  	AESENC	X10, X10
  1233  	AESENC	X11, X11
  1234  	AESENC	X12, X12
  1235  	AESENC	X13, X13
  1236  	AESENC	X14, X14
  1237  	AESENC	X15, X15
  1238  	AESENC	X8, X8
  1239  	AESENC	X9, X9
  1240  	AESENC	X10, X10
  1241  	AESENC	X11, X11
  1242  	AESENC	X12, X12
  1243  	AESENC	X13, X13
  1244  	AESENC	X14, X14
  1245  	AESENC	X15, X15
  1246  
        	// fold the eight lanes into X8 and return its low 64 bits
  1247  	PXOR	X12, X8
  1248  	PXOR	X13, X9
  1249  	PXOR	X14, X10
  1250  	PXOR	X15, X11
  1251  	PXOR	X10, X8
  1252  	PXOR	X11, X9
  1253  	PXOR	X9, X8
  1254  	MOVQ	X8, (DX)
  1255  	RET
  1256  	
  1257  TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
        	// Hash the 4 bytes at p with seed h: seed and data are packed
        	// into X0, then run through three AES rounds keyed from
        	// runtime·aeskeysched.
  1258  	MOVQ	h+8(FP), X0	// low 64 bits of X0 = seed
  1259  	MOVQ	p+0(FP), AX	// AX = ptr to data
  1260  	PINSRD	$2, (AX), X0	// dword lane 2 of X0 = data
  1261  	AESENC	runtime·aeskeysched+0(SB), X0
  1262  	AESENC	runtime·aeskeysched+16(SB), X0
  1263  	AESENC	runtime·aeskeysched+32(SB), X0
  1264  	MOVQ	X0, ret+16(FP)	// result = low 64 bits of the state
  1265  	RET
  1266  
  1267  TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
        	// Hash the 8 bytes at p with seed h: seed in the low half of X0,
        	// data in the high half, then three AES rounds keyed from
        	// runtime·aeskeysched.
  1268  	MOVQ	h+8(FP), X0	// low 64 bits of X0 = seed
  1269  	MOVQ	p+0(FP), AX	// AX = ptr to data
  1270  	PINSRQ	$1, (AX), X0	// qword lane 1 of X0 = data
  1271  	AESENC	runtime·aeskeysched+0(SB), X0
  1272  	AESENC	runtime·aeskeysched+16(SB), X0
  1273  	AESENC	runtime·aeskeysched+32(SB), X0
  1274  	MOVQ	X0, ret+16(FP)	// result = low 64 bits of the state
  1275  	RET
  1276  
  1277  // simple mask to get rid of data in the high part of the register.
        // 16 entries of 16 bytes each (low quadword first): entry i has its
        // low i bytes set to 0xff and the remaining bytes zero.
        // aeshashbody indexes it with 16*length to keep only the first
        // `length` bytes of a 16-byte load.
  1278  DATA masks<>+0x00(SB)/8, $0x0000000000000000
  1279  DATA masks<>+0x08(SB)/8, $0x0000000000000000
  1280  DATA masks<>+0x10(SB)/8, $0x00000000000000ff
  1281  DATA masks<>+0x18(SB)/8, $0x0000000000000000
  1282  DATA masks<>+0x20(SB)/8, $0x000000000000ffff
  1283  DATA masks<>+0x28(SB)/8, $0x0000000000000000
  1284  DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
  1285  DATA masks<>+0x38(SB)/8, $0x0000000000000000
  1286  DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
  1287  DATA masks<>+0x48(SB)/8, $0x0000000000000000
  1288  DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
  1289  DATA masks<>+0x58(SB)/8, $0x0000000000000000
  1290  DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
  1291  DATA masks<>+0x68(SB)/8, $0x0000000000000000
  1292  DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
  1293  DATA masks<>+0x78(SB)/8, $0x0000000000000000
  1294  DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
  1295  DATA masks<>+0x88(SB)/8, $0x0000000000000000
  1296  DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
  1297  DATA masks<>+0x98(SB)/8, $0x00000000000000ff
  1298  DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
  1299  DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
  1300  DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
  1301  DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
  1302  DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
  1303  DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
  1304  DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
  1305  DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
  1306  DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
  1307  DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
  1308  DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
  1309  DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
  1310  GLOBL masks<>(SB),RODATA,$256
  1311  
  1312  // these are arguments to pshufb.  They move data down from
  1313  // the high bytes of the register to the low bytes of the register.
  1314  // index is how many bytes to move.
        // 16 entries of 16 bytes each (low quadword first).  For entry i >= 1
        // the low i selector bytes are 16-i .. 15 (pick the top i source
        // bytes) and every other selector is 0xff, which makes PSHUFB write
        // zero to that lane.  Entry 0 is all zeros; aeshashbody never
        // indexes it because zero-length input is handled separately.
  1315  DATA shifts<>+0x00(SB)/8, $0x0000000000000000
  1316  DATA shifts<>+0x08(SB)/8, $0x0000000000000000
  1317  DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
  1318  DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
  1319  DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
  1320  DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
  1321  DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
  1322  DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
  1323  DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
  1324  DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
  1325  DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
  1326  DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
  1327  DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
  1328  DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
  1329  DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
  1330  DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
  1331  DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
  1332  DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
  1333  DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
  1334  DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
  1335  DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
  1336  DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
  1337  DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
  1338  DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
  1339  DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
  1340  DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
  1341  DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
  1342  DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
  1343  DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
  1344  DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
  1345  DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
  1346  DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
  1347  GLOBL shifts<>(SB),RODATA,$256
  1348  
  1349  TEXT runtime·memeq(SB),NOSPLIT,$0-25
        	// Entry stub: load memeqbody's register arguments and tail-jump.
  1350  	LEAQ	ret+24(FP), AX		// AX = address of the result byte
  1351  	MOVQ	size+16(FP), BX		// BX = byte count
  1352  	MOVQ	b+8(FP), DI		// DI = b
  1353  	MOVQ	a+0(FP), SI		// SI = a
  1354  	JMP	runtime·memeqbody(SB)
  1355  
  1356  // memequal_varlen(a, b unsafe.Pointer) bool
        // Reports whether the two regions hold the same bytes.  The size is
        // not an argument; it is read from offset 8 of the closure in DX.
  1357  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
  1358  	MOVQ	a+0(FP), SI
  1359  	MOVQ	b+8(FP), DI
  1360  	CMPQ	SI, DI
  1361  	JNE	compare
        	// Same pointer: trivially equal, no need to look at memory.
  1362  	MOVB	$1, ret+16(FP)
  1363  	RET
  1364  compare:
  1365  	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
  1366  	LEAQ	ret+16(FP), AX
  1367  	JMP	runtime·memeqbody(SB)
  1368  
  1369  // eqstring reports whether two strings hold the same bytes.
  1370  // Callers (the compiler) only pass strings whose lengths
  1371  // are already known to be equal, so only s1's length is read.
  1372  // See runtime_test.go:eqstring_generic for
  1373  // equivalent Go code.
  1374  TEXT runtime·eqstring(SB),NOSPLIT,$0-33
  1375  	MOVQ	s1str+0(FP), SI
  1376  	MOVQ	s2str+16(FP), DI
  1377  	CMPQ	SI, DI
  1378  	JNE	compare
        	// Same data pointer (and equal lengths): equal without comparing.
  1379  	MOVB	$1, v+32(FP)
  1380  	RET
  1381  compare:
  1382  	MOVQ	s1len+8(FP), BX
  1383  	LEAQ	v+32(FP), AX
  1384  	JMP	runtime·memeqbody(SB)
  1385  
  1386  // a in SI
  1387  // b in DI
  1388  // count in BX
  1389  // address of result byte in AX
        // Stores 1 to (AX) if the count bytes at a and b are identical,
        // 0 otherwise.  Sizes >= 8 are compared 64 bytes, then 8 bytes at a
        // time, finishing with one (possibly overlapping) 8-byte compare of
        // the tail.  Sizes < 8 use a single 8-byte load per side, chosen so
        // the load cannot run onto a following page.
  1390  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1391  	CMPQ	BX, $8
  1392  	JB	small
  1393  	
  1394  	// 64 bytes at a time using xmm registers
  1395  hugeloop:
  1396  	CMPQ	BX, $64
  1397  	JB	bigloop
  1398  	MOVOU	(SI), X0
  1399  	MOVOU	(DI), X1
  1400  	MOVOU	16(SI), X2
  1401  	MOVOU	16(DI), X3
  1402  	MOVOU	32(SI), X4
  1403  	MOVOU	32(DI), X5
  1404  	MOVOU	48(SI), X6
  1405  	MOVOU	48(DI), X7
        	// per-byte equality masks, ANDed together into X0
  1406  	PCMPEQB	X1, X0
  1407  	PCMPEQB	X3, X2
  1408  	PCMPEQB	X5, X4
  1409  	PCMPEQB	X7, X6
  1410  	PAND	X2, X0
  1411  	PAND	X6, X4
  1412  	PAND	X4, X0
  1413  	PMOVMSKB X0, DX
  1414  	ADDQ	$64, SI
  1415  	ADDQ	$64, DI
  1416  	SUBQ	$64, BX
        	// DX == 0xffff iff all 64 byte pairs matched
  1417  	CMPL	DX, $0xffff
  1418  	JEQ	hugeloop
  1419  	MOVB	$0, (AX)
  1420  	RET
  1421  
  1422  	// 8 bytes at a time using 64-bit register
  1423  bigloop:
  1424  	CMPQ	BX, $8
  1425  	JBE	leftover
  1426  	MOVQ	(SI), CX
  1427  	MOVQ	(DI), DX
  1428  	ADDQ	$8, SI
  1429  	ADDQ	$8, DI
  1430  	SUBQ	$8, BX
  1431  	CMPQ	CX, DX
  1432  	JEQ	bigloop
  1433  	MOVB	$0, (AX)
  1434  	RET
  1435  
  1436  	// remaining 0-8 bytes
        	// Compare the 8 bytes that end at the end of each buffer.  This
        	// may re-read bytes already compared; it stays inside the
        	// buffers because this path is only reached when the original
        	// count was at least 8.
  1437  leftover:
  1438  	MOVQ	-8(SI)(BX*1), CX
  1439  	MOVQ	-8(DI)(BX*1), DX
  1440  	CMPQ	CX, DX
  1441  	SETEQ	(AX)
  1442  	RET
  1443  
  1444  small:
        	// count < 8.  For count == 0 the CMPQ below leaves ZF set, so
        	// the SETEQ at `equal` stores 1.
  1445  	CMPQ	BX, $0
  1446  	JEQ	equal
  1447  
        	// CX = -8*count; as a shift count (taken mod 64) this is
        	// 64 - 8*count, the number of unwanted bits.
  1448  	LEAQ	0(BX*8), CX
  1449  	NEGQ	CX
  1450  
        	// Only the low byte of the address is tested.
  1451  	CMPB	SI, $0xf8
  1452  	JA	si_high
  1453  
  1454  	// load at SI won't cross a page boundary.
  1455  	MOVQ	(SI), SI
  1456  	JMP	si_finish
  1457  si_high:
  1458  	// address ends in 11111xxx.  Load up to bytes we want, move to correct position.
  1459  	MOVQ	-8(SI)(BX*1), SI
  1460  	SHRQ	CX, SI
  1461  si_finish:
  1462  
  1463  	// same for DI.
  1464  	CMPB	DI, $0xf8
  1465  	JA	di_high
  1466  	MOVQ	(DI), DI
  1467  	JMP	di_finish
  1468  di_high:
  1469  	MOVQ	-8(DI)(BX*1), DI
  1470  	SHRQ	CX, DI
  1471  di_finish:
  1472  
        	// The wanted bytes are now in the low 8*count bits of SI and DI;
        	// higher bits may be garbage.  Shifting the difference left by
        	// CX discards those bits, and the shift leaves ZF set exactly
        	// when the wanted bytes are equal.
  1473  	SUBQ	SI, DI
  1474  	SHLQ	CX, DI
  1475  equal:
  1476  	SETEQ	(AX)
  1477  	RET
  1478  
  1479  TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
        	// Entry stub: unpack both string headers into cmpbody's input
        	// registers and tail-jump.
  1480  	LEAQ	ret+32(FP), R9		// R9 = address of the result word
  1481  	MOVQ	s2_len+24(FP), DX	// DX = blen
  1482  	MOVQ	s2_base+16(FP), DI	// DI = b
  1483  	MOVQ	s1_len+8(FP), BX	// BX = alen
  1484  	MOVQ	s1_base+0(FP), SI	// SI = a
  1485  	JMP	runtime·cmpbody(SB)
  1486  
  1487  TEXT bytes·Compare(SB),NOSPLIT,$0-56
        	// Entry stub: take pointer and length from each slice argument
        	// (offsets 0/8 and 24/32) and tail-jump to cmpbody.
  1488  	LEAQ	res+48(FP), R9		// R9 = address of the result word
  1489  	MOVQ	s2+32(FP), DX		// DX = blen
  1490  	MOVQ	s2+24(FP), DI		// DI = b
  1491  	MOVQ	s1+8(FP), BX		// BX = alen
  1492  	MOVQ	s1+0(FP), SI		// SI = a
  1493  	JMP	runtime·cmpbody(SB)
  1494  
  1495  // input:
  1496  //   SI = a
  1497  //   DI = b
  1498  //   BX = alen
  1499  //   DX = blen
  1500  //   R9 = address of output word (stores -1/0/1 here)
        // Compares the first min(alen, blen) bytes as unsigned bytes; the
        // first differing byte decides the result.  If they all match (or
        // a and b are the same pointer) the lengths decide.
  1501  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1502  	CMPQ	SI, DI
  1503  	JEQ	allsame
  1504  	CMPQ	BX, DX
  1505  	MOVQ	DX, R8
  1506  	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
  1507  	CMPQ	R8, $8
  1508  	JB	small
  1509  
  1510  	CMPQ	R8, $63
  1511  	JA	big_loop
        	// 16 bytes at a time while more than 16 bytes remain
  1512  loop:
  1513  	CMPQ	R8, $16
  1514  	JBE	_0through16
  1515  	MOVOU	(SI), X0
  1516  	MOVOU	(DI), X1
  1517  	PCMPEQB X0, X1
  1518  	PMOVMSKB X1, AX
  1519  	XORQ	$0xffff, AX	// convert EQ to NE
  1520  	JNE	diff16	// branch if at least one byte is not equal
  1521  	ADDQ	$16, SI
  1522  	ADDQ	$16, DI
  1523  	SUBQ	$16, R8
  1524  	JMP	loop
  1525  	
        	// diff64/diff48/diff32: entered from big_loop when the mismatch
        	// is in the 4th/3rd/2nd 16-byte chunk; advance SI and DI to that
        	// chunk so diff16 can index from it.
  1526  diff64:
  1527  	ADDQ	$48, SI
  1528  	ADDQ	$48, DI
  1529  	JMP	diff16
  1530  diff48:
  1531  	ADDQ	$32, SI
  1532  	ADDQ	$32, DI
  1533  	JMP	diff16
  1534  diff32:
  1535  	ADDQ	$16, SI
  1536  	ADDQ	$16, DI
  1537  	// AX = bit mask of differences
  1538  diff16:
  1539  	BSFQ	AX, BX	// index of first byte that differs
  1540  	XORQ	AX, AX
  1541  	MOVB	(SI)(BX*1), CX
  1542  	CMPB	CX, (DI)(BX*1)
  1543  	SETHI	AX	// AX = 1 if a's byte is above b's (unsigned), else 0
  1544  	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
  1545  	MOVQ	AX, (R9)
  1546  	RET
  1547  
  1548  	// 0 through 16 bytes left, alen>=8, blen>=8
  1549  _0through16:
  1550  	CMPQ	R8, $8
  1551  	JBE	_0through8
  1552  	MOVQ	(SI), AX
  1553  	MOVQ	(DI), CX
  1554  	CMPQ	AX, CX
  1555  	JNE	diff8
  1556  _0through8:
        	// compare the 8 bytes ending at the end of the common prefix
        	// (may overlap bytes that were already compared)
  1557  	MOVQ	-8(SI)(R8*1), AX
  1558  	MOVQ	-8(DI)(R8*1), CX
  1559  	CMPQ	AX, CX
  1560  	JEQ	allsame
  1561  
  1562  	// AX and CX contain parts of a and b that differ.
  1563  diff8:
  1564  	BSWAPQ	AX	// reverse order of bytes
  1565  	BSWAPQ	CX
  1566  	XORQ	AX, CX
  1567  	BSRQ	CX, CX	// index of highest bit difference
  1568  	SHRQ	CX, AX	// move a's bit to bottom
  1569  	ANDQ	$1, AX	// mask bit
  1570  	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
  1571  	MOVQ	AX, (R9)
  1572  	RET
  1573  
  1574  	// 0-7 bytes in common
  1575  small:
  1576  	LEAQ	(R8*8), CX	// bytes left -> bits left
  1577  	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
  1578  	JEQ	allsame		// NEGQ left ZF set: no bytes in common
  1579  
  1580  	// load bytes of a into high bytes of SI
  1581  	CMPB	SI, $0xf8
  1582  	JA	si_high
  1583  	MOVQ	(SI), SI
  1584  	JMP	si_finish
  1585  si_high:
  1586  	MOVQ	-8(SI)(R8*1), SI
  1587  	SHRQ	CX, SI
  1588  si_finish:
  1589  	SHLQ	CX, SI
  1590  
  1591  	// load bytes of b into high bytes of DI
  1592  	CMPB	DI, $0xf8
  1593  	JA	di_high
  1594  	MOVQ	(DI), DI
  1595  	JMP	di_finish
  1596  di_high:
  1597  	MOVQ	-8(DI)(R8*1), DI
  1598  	SHRQ	CX, DI
  1599  di_finish:
  1600  	SHLQ	CX, DI
  1601  
  1602  	BSWAPQ	SI	// reverse order of bytes
  1603  	BSWAPQ	DI
  1604  	XORQ	SI, DI	// find bit differences
  1605  	JEQ	allsame
  1606  	BSRQ	DI, CX	// index of highest bit difference
  1607  	SHRQ	CX, SI	// move a's bit to bottom
  1608  	ANDQ	$1, SI	// mask bit
  1609  	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
  1610  	MOVQ	AX, (R9)
  1611  	RET
  1612  
        	// Common prefix is identical (or a == b): order by length.
        	// BX and DX still hold alen and blen on every path that gets here.
  1613  allsame:
  1614  	XORQ	AX, AX
  1615  	XORQ	CX, CX
  1616  	CMPQ	BX, DX
  1617  	SETGT	AX	// 1 if alen > blen
  1618  	SETEQ	CX	// 1 if alen == blen
  1619  	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
  1620  	MOVQ	AX, (R9)
  1621  	RET
  1622  
  1623  	// this works for >= 64 bytes of data.
        	// Compares four 16-byte chunks per iteration; drops back to
        	// `loop` once 64 or fewer bytes remain.
  1624  big_loop:
  1625  	MOVOU	(SI), X0
  1626  	MOVOU	(DI), X1
  1627  	PCMPEQB X0, X1
  1628  	PMOVMSKB X1, AX
  1629  	XORQ	$0xffff, AX
  1630  	JNE	diff16
  1631  
  1632  	MOVOU	16(SI), X0
  1633  	MOVOU	16(DI), X1
  1634  	PCMPEQB X0, X1
  1635  	PMOVMSKB X1, AX
  1636  	XORQ	$0xffff, AX
  1637  	JNE	diff32
  1638  
  1639  	MOVOU	32(SI), X0
  1640  	MOVOU	32(DI), X1
  1641  	PCMPEQB X0, X1
  1642  	PMOVMSKB X1, AX
  1643  	XORQ	$0xffff, AX
  1644  	JNE	diff48
  1645  
  1646  	MOVOU	48(SI), X0
  1647  	MOVOU	48(DI), X1
  1648  	PCMPEQB X0, X1
  1649  	PMOVMSKB X1, AX
  1650  	XORQ	$0xffff, AX
  1651  	JNE	diff64
  1652  
  1653  	ADDQ	$64, SI
  1654  	ADDQ	$64, DI
  1655  	SUBQ	$64, R8
  1656  	CMPQ	R8, $64
  1657  	JBE	loop
  1658  	JMP	big_loop
  1659  
  1660  TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
        	// Entry stub: load indexbytebody's register arguments from the
        	// slice and byte arguments, then tail-jump.
  1661  	LEAQ	ret+32(FP), R8		// R8 = address of the result
  1662  	MOVB	c+24(FP), AL		// AL = byte sought
  1663  	MOVQ	s_len+8(FP), BX		// BX = data len
  1664  	MOVQ	s+0(FP), SI		// SI = data
  1665  	JMP	runtime·indexbytebody(SB)
  1666  
  1667  TEXT strings·IndexByte(SB),NOSPLIT,$0-32
        	// Entry stub: load indexbytebody's register arguments from the
        	// string and byte arguments, then tail-jump.
  1668  	LEAQ	ret+24(FP), R8		// R8 = address of the result
  1669  	MOVB	c+16(FP), AL		// AL = byte sought
  1670  	MOVQ	s_len+8(FP), BX		// BX = data len
  1671  	MOVQ	s+0(FP), SI		// SI = data
  1672  	JMP	runtime·indexbytebody(SB)
  1673  
  1674  // input:
  1675  //   SI: data
  1676  //   BX: data len
  1677  //   AL: byte sought
  1678  //   R8: address to put result
        // Stores at (R8) the index of the first byte equal to AL, or -1 if
        // there is none.  Inputs shorter than 16 bytes are scanned with
        // REPN SCASB.  Longer inputs are scanned bytewise up to the first
        // 16-byte boundary, then 16 bytes at a time with SSE, then bytewise
        // over the unaligned tail.
  1679  TEXT runtime·indexbytebody(SB),NOSPLIT,$0
  1680  	MOVQ SI, DI	// DI = scan cursor; SI keeps the start for the index math
  1681  
  1682  	CMPQ BX, $16
  1683  	JLT small
  1684  
  1685  	// round up to first 16-byte boundary
  1686  	TESTQ $15, SI
  1687  	JZ aligned
  1688  	MOVQ SI, CX
  1689  	ANDQ $~15, CX
  1690  	ADDQ $16, CX
  1691  
  1692  	// search the beginning
        	// (CX = bytes up to the boundary; at most 15, and len >= 16)
  1693  	SUBQ SI, CX
  1694  	REPN; SCASB
  1695  	JZ success
  1696  
  1697  // DI is 16-byte aligned; get ready to search using SSE instructions
  1698  aligned:
  1699  	// round down to last 16-byte boundary
  1700  	MOVQ BX, R11
  1701  	ADDQ SI, R11
  1702  	ANDQ $~15, R11
  1703  
  1704  	// shuffle X0 around so that each byte contains c
  1705  	MOVD AX, X0
  1706  	PUNPCKLBW X0, X0
  1707  	PUNPCKLBW X0, X0
  1708  	PSHUFL $0, X0, X0
  1709  	JMP condition
  1710  
  1711  sse:
  1712  	// move the next 16-byte chunk of the buffer into X1
  1713  	MOVO (DI), X1
  1714  	// compare bytes in X0 to X1
  1715  	PCMPEQB X0, X1
  1716  	// take the top bit of each byte in X1 and put the result in DX
  1717  	PMOVMSKB X1, DX
  1718  	TESTL DX, DX
  1719  	JNZ ssesuccess
  1720  	ADDQ $16, DI
  1721  
  1722  condition:
  1723  	CMPQ DI, R11
  1724  	JLT sse
  1725  
  1726  	// search the end
  1727  	MOVQ SI, CX
  1728  	ADDQ BX, CX
  1729  	SUBQ R11, CX
  1730  	// if CX == 0, the zero flag will be set and we'll end up
  1731  	// returning a false success
  1732  	JZ failure
  1733  	REPN; SCASB
  1734  	JZ success
  1735  
  1736  failure:
  1737  	MOVQ $-1, (R8)
  1738  	RET
  1739  
  1740  // handle for lengths < 16
        // (for len == 0, REPN SCASB does nothing and ZF is still clear from
        // the CMPQ BX, $16 above, so this falls through to the -1 store)
  1741  small:
  1742  	MOVQ BX, CX
  1743  	REPN; SCASB
  1744  	JZ success
  1745  	MOVQ $-1, (R8)
  1746  	RET
  1747  
  1748  // we've found the chunk containing the byte
  1749  // now just figure out which specific byte it is
  1750  ssesuccess:
  1751  	// get the index of the least significant set bit
  1752  	BSFW DX, DX
  1753  	SUBQ SI, DI	// DI = offset of this chunk from the start
  1754  	ADDQ DI, DX
  1755  	MOVQ DX, (R8)
  1756  	RET
  1757  
        // SCASB leaves DI one past the matching byte, so index = DI - SI - 1.
        // The decrement must be 64-bit: a 32-bit SUBL would zero the upper
        // half of DI and truncate indexes of 2^32 or more.
  1758  success:
  1759  	SUBQ SI, DI
  1760  	SUBQ $1, DI
  1761  	MOVQ DI, (R8)
  1762  	RET
  1763  
  1764  TEXT bytes·Equal(SB),NOSPLIT,$0-49
        	// Slices of different lengths are unequal without touching the
        	// data; otherwise memeqbody compares the contents.
  1765  	MOVQ	b_len+32(FP), CX
  1766  	MOVQ	a_len+8(FP), BX		// BX = count for memeqbody
  1767  	CMPQ	BX, CX
  1768  	JEQ	samelen
  1769  	MOVB	$0, ret+48(FP)
  1770  	RET
  1771  samelen:
  1772  	LEAQ	ret+48(FP), AX		// AX = address of the result byte
  1773  	MOVQ	b+24(FP), DI		// DI = b
  1774  	MOVQ	a+0(FP), SI		// SI = a
  1775  	JMP	runtime·memeqbody(SB)
  1776  
  1777  TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
        	// Advances the 32-bit generator state kept in the current m
        	// (m_fastrand) and returns the new state:
        	//   x += x; if bit 31 of x is set { x ^= 0x88888eef }
  1778  	get_tls(CX)
  1779  	MOVQ	g(CX), AX
  1780  	MOVQ	g_m(AX), AX		// AX = current g's m
  1781  	MOVL	m_fastrand(AX), DX
  1782  	ADDL	DX, DX			// DX = 2*x
  1783  	MOVL	DX, BX			// BX = 2*x, kept un-xored
  1784  	XORL	$0x88888eef, DX		// DX = 2*x ^ constant; sets the sign flag
        	// The constant has bit 31 set, so the xored value is negative
        	// exactly when 2*x had bit 31 clear; in that case undo the xor.
        	// The CMOV relies on the flags left by the XORL just above.
  1785  	CMOVLMI	BX, DX
  1786  	MOVL	DX, m_fastrand(AX)
  1787  	MOVL	DX, ret+0(FP)
  1788  	RET
  1789  
  1790  TEXT runtime·return0(SB), NOSPLIT, $0
        	// Sets AX to 0 and returns; takes no arguments and has no frame.
  1791  	MOVL	$0, AX
  1792  	RET
  1793  
  1794  
  1795  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  1796  // Must obey the gcc calling convention.
        // The value is returned in AX; CX is used as scratch for the TLS base.
  1797  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  1798  	get_tls(CX)
  1799  	MOVQ	g(CX), AX			// AX = g
  1800  	MOVQ	g_m(AX), AX			// AX = g->m
  1801  	MOVQ	m_curg(AX), AX			// AX = g->m->curg
  1802  	MOVQ	(g_stack+stack_hi)(AX), AX	// AX = curg->stack.hi
  1803  	RET
  1804  
  1805  // The top-most function running on a goroutine
  1806  // returns to goexit+PCQuantum.
        // NOTE(review): the leading NOP presumably pads the entry so that
        // goexit+PCQuantum lands on the CALL below; PCQuantum is defined
        // elsewhere, so confirm its value for amd64.
  1807  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  1808  	BYTE	$0x90	// NOP
  1809  	CALL	runtime·goexit1(SB)	// does not return
  1810  	// traceback from goexit1 must hit code range of goexit
        	// (so the return address of the CALL must still be inside this
        	// function; hence the trailing NOP and no RET)
  1811  	BYTE	$0x90	// NOP
  1812  
  1813  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
        	// Issues a PREFETCHT0 hint for the memory at addr; returns nothing.
  1814  	MOVQ	addr+0(FP), AX
  1815  	PREFETCHT0	(AX)
  1816  	RET
  1817  
  1818  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
        	// Issues a PREFETCHT1 hint for the memory at addr; returns nothing.
  1819  	MOVQ	addr+0(FP), AX
  1820  	PREFETCHT1	(AX)
  1821  	RET
  1822  
  1823  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
        	// Issues a PREFETCHT2 hint for the memory at addr; returns nothing.
  1824  	MOVQ	addr+0(FP), AX
  1825  	PREFETCHT2	(AX)
  1826  	RET
  1827  
  1828  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
        	// Issues a PREFETCHNTA (non-temporal) hint for the memory at addr;
        	// returns nothing.
  1829  	MOVQ	addr+0(FP), AX
  1830  	PREFETCHNTA	(AX)
  1831  	RET
  1832  
  1833  // This is called from .init_array and follows the platform, not Go, ABI.
        // Appends the value passed in DI to the list ending at
        // runtime·lastmoduledatap: it is stored in the current tail's
        // moduledata_next field and then becomes the new tail.
        // NOTE(review): DI is presumably the new module's moduledata pointer,
        // passed as the first platform-ABI argument; confirm against the
        // .init_array caller.
  1834  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  1835  	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
  1836  	MOVQ	runtime·lastmoduledatap(SB), AX	// AX = current tail
  1837  	MOVQ	DI, moduledata_next(AX)		// tail.next = DI
  1838  	MOVQ	DI, runtime·lastmoduledatap(SB)	// DI is the new tail
  1839  	POPQ	R15
  1840  	RET