github.com/gocuntian/go@v0.0.0-20160610041250-fee02d270bf8/src/runtime/asm_amd64.s

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
    11  	// copy arguments forward on an even stack
    12  	MOVQ	DI, AX		// argc
    13  	MOVQ	SI, BX		// argv
    14  	SUBQ	$(4*8+7), SP		// 2args 2auto
    15  	ANDQ	$~15, SP
    16  	MOVQ	AX, 16(SP)
    17  	MOVQ	BX, 24(SP)
    18  	
    19  	// create istack out of the given (operating system) stack.
    20  	// _cgo_init may update stackguard.
    21  	MOVQ	$runtime·g0(SB), DI
    22  	LEAQ	(-64*1024+104)(SP), BX
    23  	MOVQ	BX, g_stackguard0(DI)
    24  	MOVQ	BX, g_stackguard1(DI)
    25  	MOVQ	BX, (g_stack+stack_lo)(DI)
    26  	MOVQ	SP, (g_stack+stack_hi)(DI)
    27  
    28  	// find out information about the processor we're on
    29  	MOVQ	$0, AX
    30  	CPUID
    31  	MOVQ	AX, SI
    32  	CMPQ	AX, $0
    33  	JE	nocpuinfo
    34  
    35  	// Figure out how to serialize RDTSC.
    36  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    37  	// Don't know about the rest, so let's do MFENCE.
    38  	CMPL	BX, $0x756E6547  // "Genu"
    39  	JNE	notintel
    40  	CMPL	DX, $0x49656E69  // "ineI"
    41  	JNE	notintel
    42  	CMPL	CX, $0x6C65746E  // "ntel"
    43  	JNE	notintel
    44  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
    45  notintel:
    46  
    47  	// Load EAX=1 cpuid flags
    48  	MOVQ	$1, AX
    49  	CPUID
    50  	MOVL	CX, runtime·cpuid_ecx(SB)
    51  	MOVL	DX, runtime·cpuid_edx(SB)
    52  
    53  	// Load EAX=7/ECX=0 cpuid flags
    54  	CMPQ	SI, $7
    55  	JLT	no7
    56  	MOVL	$7, AX
    57  	MOVL	$0, CX
    58  	CPUID
    59  	MOVL	BX, runtime·cpuid_ebx7(SB)
    60  no7:
     61  	// Detect AVX and AVX2 as per section 14.7.1 (Detection of AVX2) of [1]
    62  	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
    63  	// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
    64  	MOVL	runtime·cpuid_ecx(SB), CX
    65  	ANDL    $0x18000000, CX // check for OSXSAVE and AVX bits
    66  	CMPL    CX, $0x18000000
    67  	JNE     noavx
    68  	MOVL    $0, CX
    69  	// For XGETBV, OSXSAVE bit is required and sufficient
    70  	XGETBV
    71  	ANDL    $6, AX
    72  	CMPL    AX, $6 // Check for OS support of YMM registers
    73  	JNE     noavx
    74  	MOVB    $1, runtime·support_avx(SB)
    75  	TESTL   $(1<<5), runtime·cpuid_ebx7(SB) // check for AVX2 bit
    76  	JEQ     noavx2
    77  	MOVB    $1, runtime·support_avx2(SB)
    78  	JMP     nocpuinfo
    79  noavx:
    80  	MOVB    $0, runtime·support_avx(SB)
    81  noavx2:
    82  	MOVB    $0, runtime·support_avx2(SB)
    83  nocpuinfo:	
    84  	
    85  	// if there is an _cgo_init, call it.
    86  	MOVQ	_cgo_init(SB), AX
    87  	TESTQ	AX, AX
    88  	JZ	needtls
    89  	// g0 already in DI
    90  	MOVQ	DI, CX	// Win64 uses CX for first parameter
    91  	MOVQ	$setg_gcc<>(SB), SI
    92  	CALL	AX
    93  
    94  	// update stackguard after _cgo_init
    95  	MOVQ	$runtime·g0(SB), CX
    96  	MOVQ	(g_stack+stack_lo)(CX), AX
    97  	ADDQ	$const__StackGuard, AX
    98  	MOVQ	AX, g_stackguard0(CX)
    99  	MOVQ	AX, g_stackguard1(CX)
   100  
   101  #ifndef GOOS_windows
   102  	JMP ok
   103  #endif
   104  needtls:
   105  #ifdef GOOS_plan9
   106  	// skip TLS setup on Plan 9
   107  	JMP ok
   108  #endif
   109  #ifdef GOOS_solaris
   110  	// skip TLS setup on Solaris
   111  	JMP ok
   112  #endif
   113  
   114  	LEAQ	runtime·m0+m_tls(SB), DI
   115  	CALL	runtime·settls(SB)
   116  
   117  	// store through it, to make sure it works
   118  	get_tls(BX)
   119  	MOVQ	$0x123, g(BX)
   120  	MOVQ	runtime·m0+m_tls(SB), AX
   121  	CMPQ	AX, $0x123
   122  	JEQ 2(PC)
   123  	MOVL	AX, 0	// abort
   124  ok:
   125  	// set the per-goroutine and per-mach "registers"
   126  	get_tls(BX)
   127  	LEAQ	runtime·g0(SB), CX
   128  	MOVQ	CX, g(BX)
   129  	LEAQ	runtime·m0(SB), AX
   130  
   131  	// save m->g0 = g0
   132  	MOVQ	CX, m_g0(AX)
   133  	// save m0 to g0->m
   134  	MOVQ	AX, g_m(CX)
   135  
   136  	CLD				// convention is D is always left cleared
   137  	CALL	runtime·check(SB)
   138  
   139  	MOVL	16(SP), AX		// copy argc
   140  	MOVL	AX, 0(SP)
   141  	MOVQ	24(SP), AX		// copy argv
   142  	MOVQ	AX, 8(SP)
   143  	CALL	runtime·args(SB)
   144  	CALL	runtime·osinit(SB)
   145  	CALL	runtime·schedinit(SB)
   146  
   147  	// create a new goroutine to start program
   148  	MOVQ	$runtime·mainPC(SB), AX		// entry
   149  	PUSHQ	AX
   150  	PUSHQ	$0			// arg size
   151  	CALL	runtime·newproc(SB)
   152  	POPQ	AX
   153  	POPQ	AX
   154  
   155  	// start this M
   156  	CALL	runtime·mstart(SB)
   157  
   158  	MOVL	$0xf1, 0xf1  // crash
   159  	RET
   160  
   161  DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
   162  GLOBL	runtime·mainPC(SB),RODATA,$8
   163  
   164  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
   165  	BYTE	$0xcc
   166  	RET
   167  
   168  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   169  	// No per-thread init.
   170  	RET
   171  
   172  /*
   173   *  go-routine
   174   */
   175  
   176  // void gosave(Gobuf*)
   177  // save state in Gobuf; setjmp
   178  TEXT runtime·gosave(SB), NOSPLIT, $0-8
   179  	MOVQ	buf+0(FP), AX		// gobuf
   180  	LEAQ	buf+0(FP), BX		// caller's SP
   181  	MOVQ	BX, gobuf_sp(AX)
   182  	MOVQ	0(SP), BX		// caller's PC
   183  	MOVQ	BX, gobuf_pc(AX)
   184  	MOVQ	$0, gobuf_ret(AX)
   185  	MOVQ	$0, gobuf_ctxt(AX)
   186  	MOVQ	BP, gobuf_bp(AX)
   187  	get_tls(CX)
   188  	MOVQ	g(CX), BX
   189  	MOVQ	BX, gobuf_g(AX)
   190  	RET
   191  
   192  // void gogo(Gobuf*)
   193  // restore state from Gobuf; longjmp
   194  TEXT runtime·gogo(SB), NOSPLIT, $0-8
   195  	MOVQ	buf+0(FP), BX		// gobuf
   196  	MOVQ	gobuf_g(BX), DX
   197  	MOVQ	0(DX), CX		// make sure g != nil
   198  	get_tls(CX)
   199  	MOVQ	DX, g(CX)
   200  	MOVQ	gobuf_sp(BX), SP	// restore SP
   201  	MOVQ	gobuf_ret(BX), AX
   202  	MOVQ	gobuf_ctxt(BX), DX
   203  	MOVQ	gobuf_bp(BX), BP
   204  	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   205  	MOVQ	$0, gobuf_ret(BX)
   206  	MOVQ	$0, gobuf_ctxt(BX)
   207  	MOVQ	$0, gobuf_bp(BX)
   208  	MOVQ	gobuf_pc(BX), BX
   209  	JMP	BX
   210  
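// A rough Go-level sketch of the gosave/gogo pair above (illustrative only;
// the helper names callerSP, callerPC, setSP and jump are invented, not real
// runtime API):
//
//	func gosave(buf *gobuf) { // setjmp: remember where the caller is
//		buf.sp = callerSP()
//		buf.pc = callerPC()
//		buf.g = getg()
//		buf.ret, buf.ctxt = 0, 0
//	}
//
//	func gogo(buf *gobuf) { // longjmp: resume at the remembered point
//		setg(buf.g)
//		setSP(buf.sp) // switch to the saved stack
//		jump(buf.pc)  // never returns to gogo's caller
//	}
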
   211  // func mcall(fn func(*g))
   212  // Switch to m->g0's stack, call fn(g).
   213  // Fn must never return. It should gogo(&g->sched)
   214  // to keep running g.
   215  TEXT runtime·mcall(SB), NOSPLIT, $0-8
   216  	MOVQ	fn+0(FP), DI
   217  	
   218  	get_tls(CX)
   219  	MOVQ	g(CX), AX	// save state in g->sched
   220  	MOVQ	0(SP), BX	// caller's PC
   221  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   222  	LEAQ	fn+0(FP), BX	// caller's SP
   223  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   224  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   225  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   226  
   227  	// switch to m->g0 & its stack, call fn
   228  	MOVQ	g(CX), BX
   229  	MOVQ	g_m(BX), BX
   230  	MOVQ	m_g0(BX), SI
   231  	CMPQ	SI, AX	// if g == m->g0 call badmcall
   232  	JNE	3(PC)
   233  	MOVQ	$runtime·badmcall(SB), AX
   234  	JMP	AX
   235  	MOVQ	SI, g(CX)	// g = m->g0
   236  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   237  	PUSHQ	AX
   238  	MOVQ	DI, DX
   239  	MOVQ	0(DI), DI
   240  	CALL	DI
   241  	POPQ	AX
   242  	MOVQ	$runtime·badmcall2(SB), AX
   243  	JMP	AX
   244  	RET
   245  
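// Illustrative Go-level outline of mcall (saveContext and switchTo are
// invented helpers; fn must not return, so badmcall2 is only reached on misuse):
//
//	func mcall(fn func(*g)) {
//		gp := getg()
//		saveContext(&gp.sched) // caller's pc, sp, bp and gp itself
//		if gp == gp.m.g0 {
//			badmcall() // already on the scheduler stack
//		}
//		switchTo(gp.m.g0) // run on m->g0's stack
//		fn(gp)            // operates on the goroutine that called mcall
//		badmcall2()
//	}
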
   246  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   247  // of the G stack. We need to distinguish the routine that
   248  // lives at the bottom of the G stack from the one that lives
   249  // at the top of the system stack because the one at the top of
   250  // the system stack terminates the stack walk (see topofstack()).
   251  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   252  	RET
   253  
   254  // func systemstack(fn func())
   255  TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   256  	MOVQ	fn+0(FP), DI	// DI = fn
   257  	get_tls(CX)
   258  	MOVQ	g(CX), AX	// AX = g
   259  	MOVQ	g_m(AX), BX	// BX = m
   260  
   261  	MOVQ	m_gsignal(BX), DX	// DX = gsignal
   262  	CMPQ	AX, DX
   263  	JEQ	noswitch
   264  
   265  	MOVQ	m_g0(BX), DX	// DX = g0
   266  	CMPQ	AX, DX
   267  	JEQ	noswitch
   268  
   269  	MOVQ	m_curg(BX), R8
   270  	CMPQ	AX, R8
   271  	JEQ	switch
   272  	
   273  	// Bad: g is not gsignal, not g0, not curg. What is it?
   274  	MOVQ	$runtime·badsystemstack(SB), AX
   275  	CALL	AX
   276  
   277  switch:
   278  	// save our state in g->sched. Pretend to
   279  	// be systemstack_switch if the G stack is scanned.
   280  	MOVQ	$runtime·systemstack_switch(SB), SI
   281  	MOVQ	SI, (g_sched+gobuf_pc)(AX)
   282  	MOVQ	SP, (g_sched+gobuf_sp)(AX)
   283  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   284  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   285  
   286  	// switch to g0
   287  	MOVQ	DX, g(CX)
   288  	MOVQ	(g_sched+gobuf_sp)(DX), BX
   289  	// make it look like mstart called systemstack on g0, to stop traceback
   290  	SUBQ	$8, BX
   291  	MOVQ	$runtime·mstart(SB), DX
   292  	MOVQ	DX, 0(BX)
   293  	MOVQ	BX, SP
   294  
   295  	// call target function
   296  	MOVQ	DI, DX
   297  	MOVQ	0(DI), DI
   298  	CALL	DI
   299  
   300  	// switch back to g
   301  	get_tls(CX)
   302  	MOVQ	g(CX), AX
   303  	MOVQ	g_m(AX), BX
   304  	MOVQ	m_curg(BX), AX
   305  	MOVQ	AX, g(CX)
   306  	MOVQ	(g_sched+gobuf_sp)(AX), SP
   307  	MOVQ	$0, (g_sched+gobuf_sp)(AX)
   308  	RET
   309  
   310  noswitch:
   311  	// already on m stack, just call directly
   312  	MOVQ	DI, DX
   313  	MOVQ	0(DI), DI
   314  	CALL	DI
   315  	RET
   316  
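// The Go declaration is func systemstack(fn func()). A sketch of the behavior
// implemented above (illustrative; save, switchTo and switchBack are invented
// helpers, and any g other than g0, gsignal or curg is reported via badsystemstack):
//
//	func systemstack(fn func()) {
//		gp := getg()
//		if gp == gp.m.g0 || gp == gp.m.gsignal {
//			fn() // already on a system stack: call directly
//			return
//		}
//		save(&gp.sched)   // pc is set to systemstack_switch for tracebacks
//		switchTo(gp.m.g0) // with a fake mstart frame to stop unwinding
//		fn()
//		switchBack(gp.m.curg) // resume the goroutine stack
//	}
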
   317  /*
   318   * support for morestack
   319   */
   320  
   321  // Called during function prolog when more stack is needed.
   322  //
   323  // The traceback routines see morestack on a g0 as being
   324  // the top of a stack (for example, morestack calling newstack
   325  // calling the scheduler calling newm calling gc), so we must
   326  // record an argument size. For that purpose, it has no arguments.
   327  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   328  	// Cannot grow scheduler stack (m->g0).
   329  	get_tls(CX)
   330  	MOVQ	g(CX), BX
   331  	MOVQ	g_m(BX), BX
   332  	MOVQ	m_g0(BX), SI
   333  	CMPQ	g(CX), SI
   334  	JNE	2(PC)
   335  	INT	$3
   336  
   337  	// Cannot grow signal stack (m->gsignal).
   338  	MOVQ	m_gsignal(BX), SI
   339  	CMPQ	g(CX), SI
   340  	JNE	2(PC)
   341  	INT	$3
   342  
   343  	// Called from f.
   344  	// Set m->morebuf to f's caller.
   345  	MOVQ	8(SP), AX	// f's caller's PC
   346  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   347  	LEAQ	16(SP), AX	// f's caller's SP
   348  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   349  	get_tls(CX)
   350  	MOVQ	g(CX), SI
   351  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   352  
   353  	// Set g->sched to context in f.
   354  	MOVQ	0(SP), AX // f's PC
   355  	MOVQ	AX, (g_sched+gobuf_pc)(SI)
   356  	MOVQ	SI, (g_sched+gobuf_g)(SI)
   357  	LEAQ	8(SP), AX // f's SP
   358  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   359  	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
   360  	MOVQ	BP, (g_sched+gobuf_bp)(SI)
   361  
   362  	// Call newstack on m->g0's stack.
   363  	MOVQ	m_g0(BX), BX
   364  	MOVQ	BX, g(CX)
   365  	MOVQ	(g_sched+gobuf_sp)(BX), SP
   366  	CALL	runtime·newstack(SB)
   367  	MOVQ	$0, 0x1003	// crash if newstack returns
   368  	RET
   369  
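// In outline (illustrative pseudocode; callerOfF, contextOfF and switchTo are
// invented names, and newstack never returns here):
//
//	func morestack() {
//		gp := getg()
//		if gp == gp.m.g0 || gp == gp.m.gsignal {
//			crash() // the g0 and gsignal stacks cannot grow
//		}
//		gp.m.morebuf = callerOfF() // f's caller: pc, sp, g
//		gp.sched = contextOfF()    // f itself, including ctxt from DX
//		switchTo(gp.m.g0)
//		newstack() // allocates a bigger stack, copies, restarts f
//	}
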
   370  // morestack but not preserving ctxt.
   371  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   372  	MOVL	$0, DX
   373  	JMP	runtime·morestack(SB)
   374  
   375  TEXT runtime·stackBarrier(SB),NOSPLIT,$0
   376  	// We came here via a RET to an overwritten return PC.
   377  	// AX may be live. Other registers are available.
   378  
   379  	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   380  	get_tls(CX)
   381  	MOVQ	g(CX), CX
   382  	MOVQ	(g_stkbar+slice_array)(CX), DX
   383  	MOVQ	g_stkbarPos(CX), BX
   384  	IMULQ	$stkbar__size, BX	// Too big for SIB.
   385  	MOVQ	stkbar_savedLRPtr(DX)(BX*1), R8
   386  	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
   387  	// Assert that we're popping the right saved LR.
   388  	ADDQ	$8, R8
   389  	CMPQ	R8, SP
   390  	JEQ	2(PC)
   391  	MOVL	$0, 0
   392  	// Record that this stack barrier was hit.
   393  	ADDQ	$1, g_stkbarPos(CX)
   394  	// Jump to the original return PC.
   395  	JMP	BX
   396  
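// The equivalent bookkeeping in Go terms (illustrative; the real routine runs
// with no frame of its own because it is entered via an overwritten return PC):
//
//	func stackBarrier() {
//		gp := getg()
//		b := gp.stkbar[gp.stkbarPos]
//		if b.savedLRPtr+8 != getSP() { // must be popping the slot we overwrote
//			crash()
//		}
//		gp.stkbarPos++
//		jump(b.savedLRVal) // continue at the original return PC
//	}
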
   397  // reflectcall: call a function with the given argument list
   398  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   399  // we don't have variable-sized frames, so we use a small number
   400  // of constant-sized-frame functions to encode a few bits of size in the pc.
   401  // Caution: ugly multiline assembly macros in your future!
   402  
   403  #define DISPATCH(NAME,MAXSIZE)		\
   404  	CMPQ	CX, $MAXSIZE;		\
   405  	JA	3(PC);			\
   406  	MOVQ	$NAME(SB), AX;		\
   407  	JMP	AX
   408  // Note: can't just "JMP NAME(SB)" - bad inlining results.
   409  
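// The DISPATCH chain used below amounts to a size switch (illustrative Go;
// tailcall stands for the register-indirect JMP):
//
//	switch {
//	case argsize <= 32:
//		tailcall(call32)
//	case argsize <= 64:
//		tailcall(call64)
//	// ... doubling up to 1<<30 ...
//	default:
//		tailcall(badreflectcall)
//	}
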
   410  TEXT reflect·call(SB), NOSPLIT, $0-0
   411  	JMP	·reflectcall(SB)
   412  
   413  TEXT ·reflectcall(SB), NOSPLIT, $0-32
   414  	MOVLQZX argsize+24(FP), CX
   415  	// NOTE(rsc): No call16, because CALLFN needs four words
   416  	// of argument space to invoke callwritebarrier.
   417  	DISPATCH(runtime·call32, 32)
   418  	DISPATCH(runtime·call64, 64)
   419  	DISPATCH(runtime·call128, 128)
   420  	DISPATCH(runtime·call256, 256)
   421  	DISPATCH(runtime·call512, 512)
   422  	DISPATCH(runtime·call1024, 1024)
   423  	DISPATCH(runtime·call2048, 2048)
   424  	DISPATCH(runtime·call4096, 4096)
   425  	DISPATCH(runtime·call8192, 8192)
   426  	DISPATCH(runtime·call16384, 16384)
   427  	DISPATCH(runtime·call32768, 32768)
   428  	DISPATCH(runtime·call65536, 65536)
   429  	DISPATCH(runtime·call131072, 131072)
   430  	DISPATCH(runtime·call262144, 262144)
   431  	DISPATCH(runtime·call524288, 524288)
   432  	DISPATCH(runtime·call1048576, 1048576)
   433  	DISPATCH(runtime·call2097152, 2097152)
   434  	DISPATCH(runtime·call4194304, 4194304)
   435  	DISPATCH(runtime·call8388608, 8388608)
   436  	DISPATCH(runtime·call16777216, 16777216)
   437  	DISPATCH(runtime·call33554432, 33554432)
   438  	DISPATCH(runtime·call67108864, 67108864)
   439  	DISPATCH(runtime·call134217728, 134217728)
   440  	DISPATCH(runtime·call268435456, 268435456)
   441  	DISPATCH(runtime·call536870912, 536870912)
   442  	DISPATCH(runtime·call1073741824, 1073741824)
   443  	MOVQ	$runtime·badreflectcall(SB), AX
   444  	JMP	AX
   445  
   446  #define CALLFN(NAME,MAXSIZE)			\
   447  TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   448  	NO_LOCAL_POINTERS;			\
   449  	/* copy arguments to stack */		\
   450  	MOVQ	argptr+16(FP), SI;		\
   451  	MOVLQZX argsize+24(FP), CX;		\
   452  	MOVQ	SP, DI;				\
   453  	REP;MOVSB;				\
   454  	/* call function */			\
   455  	MOVQ	f+8(FP), DX;			\
   456  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   457  	CALL	(DX);				\
   458  	/* copy return values back */		\
   459  	MOVQ	argptr+16(FP), DI;		\
   460  	MOVLQZX	argsize+24(FP), CX;		\
   461  	MOVLQZX retoffset+28(FP), BX;		\
   462  	MOVQ	SP, SI;				\
   463  	ADDQ	BX, DI;				\
   464  	ADDQ	BX, SI;				\
   465  	SUBQ	BX, CX;				\
   466  	REP;MOVSB;				\
   467  	/* execute write barrier updates */	\
   468  	MOVQ	argtype+0(FP), DX;		\
   469  	MOVQ	argptr+16(FP), DI;		\
   470  	MOVLQZX	argsize+24(FP), CX;		\
   471  	MOVLQZX retoffset+28(FP), BX;		\
   472  	MOVQ	DX, 0(SP);			\
   473  	MOVQ	DI, 8(SP);			\
   474  	MOVQ	CX, 16(SP);			\
   475  	MOVQ	BX, 24(SP);			\
   476  	CALL	runtime·callwritebarrier(SB);	\
   477  	RET
   478  
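// Each callNNN instantiated below therefore behaves roughly like the sketch
// here (illustrative pseudocode; frame stands for the fixed NNN-byte stack
// frame, and the pointer arithmetic is really done by REP;MOVSB copies):
//
//	func callNNN(argtype *_type, f *funcval, arg *byte, argsize, retoffset uint32) {
//		memmove(frame, arg, argsize)                               // copy arguments in
//		call(f, frame)                                             // fn writes results into frame
//		memmove(arg+retoffset, frame+retoffset, argsize-retoffset) // copy results back
//		callwritebarrier(argtype, arg, argsize, retoffset)         // tell the GC about copied pointers
//	}
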
   479  CALLFN(·call32, 32)
   480  CALLFN(·call64, 64)
   481  CALLFN(·call128, 128)
   482  CALLFN(·call256, 256)
   483  CALLFN(·call512, 512)
   484  CALLFN(·call1024, 1024)
   485  CALLFN(·call2048, 2048)
   486  CALLFN(·call4096, 4096)
   487  CALLFN(·call8192, 8192)
   488  CALLFN(·call16384, 16384)
   489  CALLFN(·call32768, 32768)
   490  CALLFN(·call65536, 65536)
   491  CALLFN(·call131072, 131072)
   492  CALLFN(·call262144, 262144)
   493  CALLFN(·call524288, 524288)
   494  CALLFN(·call1048576, 1048576)
   495  CALLFN(·call2097152, 2097152)
   496  CALLFN(·call4194304, 4194304)
   497  CALLFN(·call8388608, 8388608)
   498  CALLFN(·call16777216, 16777216)
   499  CALLFN(·call33554432, 33554432)
   500  CALLFN(·call67108864, 67108864)
   501  CALLFN(·call134217728, 134217728)
   502  CALLFN(·call268435456, 268435456)
   503  CALLFN(·call536870912, 536870912)
   504  CALLFN(·call1073741824, 1073741824)
   505  
   506  TEXT runtime·procyield(SB),NOSPLIT,$0-0
   507  	MOVL	cycles+0(FP), AX
   508  again:
   509  	PAUSE
   510  	SUBL	$1, AX
   511  	JNZ	again
   512  	RET
   513  
   514  
   515  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   516  	// Stores are already ordered on x86, so this is just a
   517  	// compile barrier.
   518  	RET
   519  
   520  // void jmpdefer(fn, sp);
   521  // called from deferreturn.
   522  // 1. pop the caller
    523  // 2. subtract 5 bytes (the length of the CALL instruction) from the caller's return address
   524  // 3. jmp to the argument
   525  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   526  	MOVQ	fv+0(FP), DX	// fn
   527  	MOVQ	argp+8(FP), BX	// caller sp
   528  	LEAQ	-8(BX), SP	// caller sp after CALL
   529  	MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
   530  	SUBQ	$5, (SP)	// return to CALL again
   531  	MOVQ	0(DX), BX
   532  	JMP	BX	// but first run the deferred function
   533  
   534  // Save state of caller into g->sched. Smashes R8, R9.
   535  TEXT gosave<>(SB),NOSPLIT,$0
   536  	get_tls(R8)
   537  	MOVQ	g(R8), R8
   538  	MOVQ	0(SP), R9
   539  	MOVQ	R9, (g_sched+gobuf_pc)(R8)
   540  	LEAQ	8(SP), R9
   541  	MOVQ	R9, (g_sched+gobuf_sp)(R8)
   542  	MOVQ	$0, (g_sched+gobuf_ret)(R8)
   543  	MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
   544  	MOVQ	BP, (g_sched+gobuf_bp)(R8)
   545  	RET
   546  
   547  // func asmcgocall(fn, arg unsafe.Pointer) int32
   548  // Call fn(arg) on the scheduler stack,
   549  // aligned appropriately for the gcc ABI.
   550  // See cgocall.go for more details.
   551  TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   552  	MOVQ	fn+0(FP), AX
   553  	MOVQ	arg+8(FP), BX
   554  
   555  	MOVQ	SP, DX
   556  
   557  	// Figure out if we need to switch to m->g0 stack.
   558  	// We get called to create new OS threads too, and those
   559  	// come in on the m->g0 stack already.
   560  	get_tls(CX)
   561  	MOVQ	g(CX), R8
   562  	CMPQ	R8, $0
   563  	JEQ	nosave
   564  	MOVQ	g_m(R8), R8
   565  	MOVQ	m_g0(R8), SI
   566  	MOVQ	g(CX), DI
   567  	CMPQ	SI, DI
   568  	JEQ	nosave
   569  	MOVQ	m_gsignal(R8), SI
   570  	CMPQ	SI, DI
   571  	JEQ	nosave
   572  	
   573  	// Switch to system stack.
   574  	MOVQ	m_g0(R8), SI
   575  	CALL	gosave<>(SB)
   576  	MOVQ	SI, g(CX)
   577  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   578  
   579  	// Now on a scheduling stack (a pthread-created stack).
   580  	// Make sure we have enough room for 4 stack-backed fast-call
   581  	// registers as per windows amd64 calling convention.
   582  	SUBQ	$64, SP
   583  	ANDQ	$~15, SP	// alignment for gcc ABI
   584  	MOVQ	DI, 48(SP)	// save g
   585  	MOVQ	(g_stack+stack_hi)(DI), DI
   586  	SUBQ	DX, DI
   587  	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   588  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   589  	MOVQ	BX, CX		// CX = first argument in Win64
   590  	CALL	AX
   591  
   592  	// Restore registers, g, stack pointer.
   593  	get_tls(CX)
   594  	MOVQ	48(SP), DI
   595  	MOVQ	(g_stack+stack_hi)(DI), SI
   596  	SUBQ	40(SP), SI
   597  	MOVQ	DI, g(CX)
   598  	MOVQ	SI, SP
   599  
   600  	MOVL	AX, ret+16(FP)
   601  	RET
   602  
   603  nosave:
   604  	// Running on a system stack, perhaps even without a g.
   605  	// Having no g can happen during thread creation or thread teardown
   606  	// (see needm/dropm on Solaris, for example).
   607  	// This code is like the above sequence but without saving/restoring g
   608  	// and without worrying about the stack moving out from under us
   609  	// (because we're on a system stack, not a goroutine stack).
   610  	// The above code could be used directly if already on a system stack,
   611  	// but then the only path through this code would be a rare case on Solaris.
   612  	// Using this code for all "already on system stack" calls exercises it more,
   613  	// which should help keep it correct.
   614  	SUBQ	$64, SP
   615  	ANDQ	$~15, SP
   616  	MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
   617  	MOVQ	DX, 40(SP)	// save original stack pointer
   618  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   619  	MOVQ	BX, CX		// CX = first argument in Win64
   620  	CALL	AX
   621  	MOVQ	40(SP), SI	// restore original stack pointer
   622  	MOVQ	SI, SP
   623  	MOVL	AX, ret+16(FP)
   624  	RET
   625  
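// Illustrative Go-level outline of asmcgocall (getSP, switchTo, alignStackForC,
// callC and restoreG are invented helpers):
//
//	func asmcgocall(fn, arg unsafe.Pointer) int32 {
//		if g := getg(); g != nil && g != g.m.g0 && g != g.m.gsignal {
//			gosave(&g.sched)
//			switchTo(g.m.g0) // C code needs a large, fixed-size stack
//		}
//		alignStackForC()      // 16-byte alignment plus Win64 spill space
//		ret := callC(fn, arg) // arg passed in DI (SysV) and CX (Win64)
//		restoreG()            // SP is recomputed from the saved stack depth,
//		return ret            // since the goroutine stack may have been copied
//	}
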
   626  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
   627  // Turn the fn into a Go func (by taking its address) and call
   628  // cgocallback_gofunc.
   629  TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
   630  	LEAQ	fn+0(FP), AX
   631  	MOVQ	AX, 0(SP)
   632  	MOVQ	frame+8(FP), AX
   633  	MOVQ	AX, 8(SP)
   634  	MOVQ	framesize+16(FP), AX
   635  	MOVQ	AX, 16(SP)
   636  	MOVQ	ctxt+24(FP), AX
   637  	MOVQ	AX, 24(SP)
   638  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   639  	CALL	AX
   640  	RET
   641  
   642  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
   643  // See cgocall.go for more details.
   644  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
   645  	NO_LOCAL_POINTERS
   646  
   647  	// If g is nil, Go did not create the current thread.
   648  	// Call needm to obtain one m for temporary use.
   649  	// In this case, we're running on the thread stack, so there's
   650  	// lots of space, but the linker doesn't know. Hide the call from
   651  	// the linker analysis by using an indirect call through AX.
   652  	get_tls(CX)
   653  #ifdef GOOS_windows
   654  	MOVL	$0, BX
   655  	CMPQ	CX, $0
   656  	JEQ	2(PC)
   657  #endif
   658  	MOVQ	g(CX), BX
   659  	CMPQ	BX, $0
   660  	JEQ	needm
   661  	MOVQ	g_m(BX), BX
   662  	MOVQ	BX, R8 // holds oldm until end of function
   663  	JMP	havem
   664  needm:
   665  	MOVQ	$0, 0(SP)
   666  	MOVQ	$runtime·needm(SB), AX
   667  	CALL	AX
   668  	MOVQ	0(SP), R8
   669  	get_tls(CX)
   670  	MOVQ	g(CX), BX
   671  	MOVQ	g_m(BX), BX
   672  	
   673  	// Set m->sched.sp = SP, so that if a panic happens
   674  	// during the function we are about to execute, it will
   675  	// have a valid SP to run on the g0 stack.
   676  	// The next few lines (after the havem label)
   677  	// will save this SP onto the stack and then write
   678  	// the same SP back to m->sched.sp. That seems redundant,
   679  	// but if an unrecovered panic happens, unwindm will
   680  	// restore the g->sched.sp from the stack location
   681  	// and then systemstack will try to use it. If we don't set it here,
   682  	// that restored SP will be uninitialized (typically 0) and
   683  	// will not be usable.
   684  	MOVQ	m_g0(BX), SI
   685  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   686  
   687  havem:
   688  	// Now there's a valid m, and we're running on its m->g0.
   689  	// Save current m->g0->sched.sp on stack and then set it to SP.
   690  	// Save current sp in m->g0->sched.sp in preparation for
   691  	// switch back to m->curg stack.
   692  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   693  	MOVQ	m_g0(BX), SI
   694  	MOVQ	(g_sched+gobuf_sp)(SI), AX
   695  	MOVQ	AX, 0(SP)
   696  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   697  
   698  	// Switch to m->curg stack and call runtime.cgocallbackg.
   699  	// Because we are taking over the execution of m->curg
   700  	// but *not* resuming what had been running, we need to
   701  	// save that information (m->curg->sched) so we can restore it.
   702  	// We can restore m->curg->sched.sp easily, because calling
   703  	// runtime.cgocallbackg leaves SP unchanged upon return.
   704  	// To save m->curg->sched.pc, we push it onto the stack.
   705  	// This has the added benefit that it looks to the traceback
   706  	// routine like cgocallbackg is going to return to that
   707  	// PC (because the frame we allocate below has the same
   708  	// size as cgocallback_gofunc's frame declared above)
   709  	// so that the traceback will seamlessly trace back into
   710  	// the earlier calls.
   711  	//
   712  	// In the new goroutine, 8(SP) holds the saved R8.
   713  	MOVQ	m_curg(BX), SI
   714  	MOVQ	SI, g(CX)
   715  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   716  	MOVQ	(g_sched+gobuf_pc)(SI), BX
   717  	MOVQ	BX, -8(DI)
   718  	// Compute the size of the frame, including return PC and, if
    719  	// GOEXPERIMENT=framepointer, the saved base pointer
   720  	MOVQ	ctxt+24(FP), BX
   721  	LEAQ	fv+0(FP), AX
   722  	SUBQ	SP, AX
   723  	SUBQ	AX, DI
   724  	MOVQ	DI, SP
   725  
   726  	MOVQ	R8, 8(SP)
   727  	MOVQ	BX, 0(SP)
   728  	CALL	runtime·cgocallbackg(SB)
   729  	MOVQ	8(SP), R8
   730  
   731  	// Compute the size of the frame again. FP and SP have
   732  	// completely different values here than they did above,
   733  	// but only their difference matters.
   734  	LEAQ	fv+0(FP), AX
   735  	SUBQ	SP, AX
   736  
   737  	// Restore g->sched (== m->curg->sched) from saved values.
   738  	get_tls(CX)
   739  	MOVQ	g(CX), SI
   740  	MOVQ	SP, DI
   741  	ADDQ	AX, DI
   742  	MOVQ	-8(DI), BX
   743  	MOVQ	BX, (g_sched+gobuf_pc)(SI)
   744  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   745  
   746  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   747  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   748  	// so we do not have to restore it.)
   749  	MOVQ	g(CX), BX
   750  	MOVQ	g_m(BX), BX
   751  	MOVQ	m_g0(BX), SI
   752  	MOVQ	SI, g(CX)
   753  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   754  	MOVQ	0(SP), AX
   755  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   756  	
   757  	// If the m on entry was nil, we called needm above to borrow an m
   758  	// for the duration of the call. Since the call is over, return it with dropm.
   759  	CMPQ	R8, $0
   760  	JNE 3(PC)
   761  	MOVQ	$runtime·dropm(SB), AX
   762  	CALL	AX
   763  
   764  	// Done!
   765  	RET
   766  
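// A rough outline of the callback path above (illustrative only; switchTo and
// getSP are invented helpers, and cgocallbackg does the actual fn(frame) call):
//
//	func cgocallback_gofunc(fn *funcval, frame, framesize, ctxt uintptr) {
//		borrowed := getg() == nil
//		if borrowed {
//			needm() // borrow an m for this non-Go thread
//		}
//		saved := m.g0.sched.sp // so nested callbacks unwind correctly
//		m.g0.sched.sp = getSP()
//		switchTo(m.curg) // run the Go code on a goroutine stack
//		cgocallbackg(ctxt)
//		switchTo(m.g0)
//		m.g0.sched.sp = saved
//		if borrowed {
//			dropm() // return the borrowed m
//		}
//	}
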
   767  // void setg(G*); set g. for use by needm.
   768  TEXT runtime·setg(SB), NOSPLIT, $0-8
   769  	MOVQ	gg+0(FP), BX
   770  #ifdef GOOS_windows
   771  	CMPQ	BX, $0
   772  	JNE	settls
   773  	MOVQ	$0, 0x28(GS)
   774  	RET
   775  settls:
   776  	MOVQ	g_m(BX), AX
   777  	LEAQ	m_tls(AX), AX
   778  	MOVQ	AX, 0x28(GS)
   779  #endif
   780  	get_tls(CX)
   781  	MOVQ	BX, g(CX)
   782  	RET
   783  
   784  // void setg_gcc(G*); set g called from gcc.
   785  TEXT setg_gcc<>(SB),NOSPLIT,$0
   786  	get_tls(AX)
   787  	MOVQ	DI, g(AX)
   788  	RET
   789  
   790  // check that SP is in range [g->stack.lo, g->stack.hi)
   791  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   792  	get_tls(CX)
   793  	MOVQ	g(CX), AX
   794  	CMPQ	(g_stack+stack_hi)(AX), SP
   795  	JHI	2(PC)
   796  	INT	$3
   797  	CMPQ	SP, (g_stack+stack_lo)(AX)
   798  	JHI	2(PC)
   799  	INT	$3
   800  	RET
   801  
   802  TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
   803  	MOVQ	argp+0(FP),AX		// addr of first arg
   804  	MOVQ	-8(AX),AX		// get calling pc
   805  	CMPQ	AX, runtime·stackBarrierPC(SB)
   806  	JNE	nobar
   807  	// Get original return PC.
   808  	CALL	runtime·nextBarrierPC(SB)
   809  	MOVQ	0(SP), AX
   810  nobar:
   811  	MOVQ	AX, ret+8(FP)
   812  	RET
   813  
   814  TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
   815  	MOVQ	argp+0(FP),AX		// addr of first arg
   816  	MOVQ	pc+8(FP), BX
   817  	MOVQ	-8(AX), CX
   818  	CMPQ	CX, runtime·stackBarrierPC(SB)
   819  	JEQ	setbar
   820  	MOVQ	BX, -8(AX)		// set calling pc
   821  	RET
   822  setbar:
   823  	// Set the stack barrier return PC.
   824  	MOVQ	BX, 0(SP)
   825  	CALL	runtime·setNextBarrierPC(SB)
   826  	RET
   827  
   828  TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
   829  	MOVQ	argp+0(FP), AX
   830  	MOVQ	AX, ret+8(FP)
   831  	RET
   832  
   833  // func cputicks() int64
   834  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   835  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   836  	JNE	mfence
   837  	LFENCE
   838  	JMP	done
   839  mfence:
   840  	MFENCE
   841  done:
   842  	RDTSC
   843  	SHLQ	$32, DX
   844  	ADDQ	DX, AX
   845  	MOVQ	AX, ret+0(FP)
   846  	RET
   847  
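// Equivalent pseudocode for cputicks (illustrative; lfence, mfence and rdtsc
// stand for the corresponding instructions):
//
//	func cputicks() int64 {
//		if lfenceBeforeRdtsc {
//			lfence() // Intel: LFENCE is enough to serialize RDTSC
//		} else {
//			mfence() // AMD and unknown vendors: use MFENCE
//		}
//		lo, hi := rdtsc()
//		return int64(hi)<<32 + int64(lo)
//	}
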
    848  // memhash_varlen(p unsafe.Pointer, h uintptr) uintptr
   849  // redirects to memhash(p, h, size) using the size
   850  // stored in the closure.
   851  TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
   852  	GO_ARGS
   853  	NO_LOCAL_POINTERS
   854  	MOVQ	p+0(FP), AX
   855  	MOVQ	h+8(FP), BX
   856  	MOVQ	8(DX), CX
   857  	MOVQ	AX, 0(SP)
   858  	MOVQ	BX, 8(SP)
   859  	MOVQ	CX, 16(SP)
   860  	CALL	runtime·memhash(SB)
   861  	MOVQ	24(SP), AX
   862  	MOVQ	AX, ret+16(FP)
   863  	RET
   864  
   865  // hash function using AES hardware instructions
   866  TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   867  	MOVQ	p+0(FP), AX	// ptr to data
   868  	MOVQ	s+16(FP), CX	// size
   869  	LEAQ	ret+24(FP), DX
   870  	JMP	runtime·aeshashbody(SB)
   871  
   872  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
   873  	MOVQ	p+0(FP), AX	// ptr to string struct
   874  	MOVQ	8(AX), CX	// length of string
   875  	MOVQ	(AX), AX	// string data
   876  	LEAQ	ret+16(FP), DX
   877  	JMP	runtime·aeshashbody(SB)
   878  
   879  // AX: data
   880  // CX: length
   881  // DX: address to put return value
   882  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   883  	// Fill an SSE register with our seeds.
   884  	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
   885  	PINSRW	$4, CX, X0			// 16 bits of length
   886  	PSHUFHW $0, X0, X0			// repeat length 4 times total
   887  	MOVO	X0, X1				// save unscrambled seed
   888  	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
   889  	AESENC	X0, X0				// scramble seed
   890  
   891  	CMPQ	CX, $16
   892  	JB	aes0to15
   893  	JE	aes16
   894  	CMPQ	CX, $32
   895  	JBE	aes17to32
   896  	CMPQ	CX, $64
   897  	JBE	aes33to64
   898  	CMPQ	CX, $128
   899  	JBE	aes65to128
   900  	JMP	aes129plus
   901  
   902  aes0to15:
   903  	TESTQ	CX, CX
   904  	JE	aes0
   905  
   906  	ADDQ	$16, AX
   907  	TESTW	$0xff0, AX
   908  	JE	endofpage
   909  
   910  	// 16 bytes loaded at this address won't cross
   911  	// a page boundary, so we can load it directly.
   912  	MOVOU	-16(AX), X1
   913  	ADDQ	CX, CX
   914  	MOVQ	$masks<>(SB), AX
   915  	PAND	(AX)(CX*8), X1
   916  final1:
   917  	AESENC	X0, X1	// scramble input, xor in seed
   918  	AESENC	X1, X1  // scramble combo 2 times
   919  	AESENC	X1, X1
   920  	MOVQ	X1, (DX)
   921  	RET
   922  
   923  endofpage:
   924  	// address ends in 1111xxxx. Might be up against
   925  	// a page boundary, so load ending at last byte.
   926  	// Then shift bytes down using pshufb.
   927  	MOVOU	-32(AX)(CX*1), X1
   928  	ADDQ	CX, CX
   929  	MOVQ	$shifts<>(SB), AX
   930  	PSHUFB	(AX)(CX*8), X1
   931  	JMP	final1
   932  
   933  aes0:
   934  	// Return scrambled input seed
   935  	AESENC	X0, X0
   936  	MOVQ	X0, (DX)
   937  	RET
   938  
   939  aes16:
   940  	MOVOU	(AX), X1
   941  	JMP	final1
   942  
   943  aes17to32:
   944  	// make second starting seed
   945  	PXOR	runtime·aeskeysched+16(SB), X1
   946  	AESENC	X1, X1
   947  	
   948  	// load data to be hashed
   949  	MOVOU	(AX), X2
   950  	MOVOU	-16(AX)(CX*1), X3
   951  
   952  	// scramble 3 times
   953  	AESENC	X0, X2
   954  	AESENC	X1, X3
   955  	AESENC	X2, X2
   956  	AESENC	X3, X3
   957  	AESENC	X2, X2
   958  	AESENC	X3, X3
   959  
   960  	// combine results
   961  	PXOR	X3, X2
   962  	MOVQ	X2, (DX)
   963  	RET
   964  
   965  aes33to64:
   966  	// make 3 more starting seeds
   967  	MOVO	X1, X2
   968  	MOVO	X1, X3
   969  	PXOR	runtime·aeskeysched+16(SB), X1
   970  	PXOR	runtime·aeskeysched+32(SB), X2
   971  	PXOR	runtime·aeskeysched+48(SB), X3
   972  	AESENC	X1, X1
   973  	AESENC	X2, X2
   974  	AESENC	X3, X3
   975  	
   976  	MOVOU	(AX), X4
   977  	MOVOU	16(AX), X5
   978  	MOVOU	-32(AX)(CX*1), X6
   979  	MOVOU	-16(AX)(CX*1), X7
   980  	
   981  	AESENC	X0, X4
   982  	AESENC	X1, X5
   983  	AESENC	X2, X6
   984  	AESENC	X3, X7
   985  	
   986  	AESENC	X4, X4
   987  	AESENC	X5, X5
   988  	AESENC	X6, X6
   989  	AESENC	X7, X7
   990  	
   991  	AESENC	X4, X4
   992  	AESENC	X5, X5
   993  	AESENC	X6, X6
   994  	AESENC	X7, X7
   995  
   996  	PXOR	X6, X4
   997  	PXOR	X7, X5
   998  	PXOR	X5, X4
   999  	MOVQ	X4, (DX)
  1000  	RET
  1001  
  1002  aes65to128:
  1003  	// make 7 more starting seeds
  1004  	MOVO	X1, X2
  1005  	MOVO	X1, X3
  1006  	MOVO	X1, X4
  1007  	MOVO	X1, X5
  1008  	MOVO	X1, X6
  1009  	MOVO	X1, X7
  1010  	PXOR	runtime·aeskeysched+16(SB), X1
  1011  	PXOR	runtime·aeskeysched+32(SB), X2
  1012  	PXOR	runtime·aeskeysched+48(SB), X3
  1013  	PXOR	runtime·aeskeysched+64(SB), X4
  1014  	PXOR	runtime·aeskeysched+80(SB), X5
  1015  	PXOR	runtime·aeskeysched+96(SB), X6
  1016  	PXOR	runtime·aeskeysched+112(SB), X7
  1017  	AESENC	X1, X1
  1018  	AESENC	X2, X2
  1019  	AESENC	X3, X3
  1020  	AESENC	X4, X4
  1021  	AESENC	X5, X5
  1022  	AESENC	X6, X6
  1023  	AESENC	X7, X7
  1024  
  1025  	// load data
  1026  	MOVOU	(AX), X8
  1027  	MOVOU	16(AX), X9
  1028  	MOVOU	32(AX), X10
  1029  	MOVOU	48(AX), X11
  1030  	MOVOU	-64(AX)(CX*1), X12
  1031  	MOVOU	-48(AX)(CX*1), X13
  1032  	MOVOU	-32(AX)(CX*1), X14
  1033  	MOVOU	-16(AX)(CX*1), X15
  1034  
  1035  	// scramble data, xor in seed
  1036  	AESENC	X0, X8
  1037  	AESENC	X1, X9
  1038  	AESENC	X2, X10
  1039  	AESENC	X3, X11
  1040  	AESENC	X4, X12
  1041  	AESENC	X5, X13
  1042  	AESENC	X6, X14
  1043  	AESENC	X7, X15
  1044  
  1045  	// scramble twice
  1046  	AESENC	X8, X8
  1047  	AESENC	X9, X9
  1048  	AESENC	X10, X10
  1049  	AESENC	X11, X11
  1050  	AESENC	X12, X12
  1051  	AESENC	X13, X13
  1052  	AESENC	X14, X14
  1053  	AESENC	X15, X15
  1054  	
  1055  	AESENC	X8, X8
  1056  	AESENC	X9, X9
  1057  	AESENC	X10, X10
  1058  	AESENC	X11, X11
  1059  	AESENC	X12, X12
  1060  	AESENC	X13, X13
  1061  	AESENC	X14, X14
  1062  	AESENC	X15, X15
  1063  
  1064  	// combine results
  1065  	PXOR	X12, X8
  1066  	PXOR	X13, X9
  1067  	PXOR	X14, X10
  1068  	PXOR	X15, X11
  1069  	PXOR	X10, X8
  1070  	PXOR	X11, X9
  1071  	PXOR	X9, X8
  1072  	MOVQ	X8, (DX)
  1073  	RET
  1074  
  1075  aes129plus:
  1076  	// make 7 more starting seeds
  1077  	MOVO	X1, X2
  1078  	MOVO	X1, X3
  1079  	MOVO	X1, X4
  1080  	MOVO	X1, X5
  1081  	MOVO	X1, X6
  1082  	MOVO	X1, X7
  1083  	PXOR	runtime·aeskeysched+16(SB), X1
  1084  	PXOR	runtime·aeskeysched+32(SB), X2
  1085  	PXOR	runtime·aeskeysched+48(SB), X3
  1086  	PXOR	runtime·aeskeysched+64(SB), X4
  1087  	PXOR	runtime·aeskeysched+80(SB), X5
  1088  	PXOR	runtime·aeskeysched+96(SB), X6
  1089  	PXOR	runtime·aeskeysched+112(SB), X7
  1090  	AESENC	X1, X1
  1091  	AESENC	X2, X2
  1092  	AESENC	X3, X3
  1093  	AESENC	X4, X4
  1094  	AESENC	X5, X5
  1095  	AESENC	X6, X6
  1096  	AESENC	X7, X7
  1097  	
  1098  	// start with last (possibly overlapping) block
  1099  	MOVOU	-128(AX)(CX*1), X8
  1100  	MOVOU	-112(AX)(CX*1), X9
  1101  	MOVOU	-96(AX)(CX*1), X10
  1102  	MOVOU	-80(AX)(CX*1), X11
  1103  	MOVOU	-64(AX)(CX*1), X12
  1104  	MOVOU	-48(AX)(CX*1), X13
  1105  	MOVOU	-32(AX)(CX*1), X14
  1106  	MOVOU	-16(AX)(CX*1), X15
  1107  
  1108  	// scramble input once, xor in seed
  1109  	AESENC	X0, X8
  1110  	AESENC	X1, X9
  1111  	AESENC	X2, X10
  1112  	AESENC	X3, X11
  1113  	AESENC	X4, X12
  1114  	AESENC	X5, X13
  1115  	AESENC	X6, X14
  1116  	AESENC	X7, X15
  1117  	
  1118  	// compute number of remaining 128-byte blocks
  1119  	DECQ	CX
  1120  	SHRQ	$7, CX
  1121  	
  1122  aesloop:
  1123  	// scramble state, xor in a block
  1124  	MOVOU	(AX), X0
  1125  	MOVOU	16(AX), X1
  1126  	MOVOU	32(AX), X2
  1127  	MOVOU	48(AX), X3
  1128  	AESENC	X0, X8
  1129  	AESENC	X1, X9
  1130  	AESENC	X2, X10
  1131  	AESENC	X3, X11
  1132  	MOVOU	64(AX), X4
  1133  	MOVOU	80(AX), X5
  1134  	MOVOU	96(AX), X6
  1135  	MOVOU	112(AX), X7
  1136  	AESENC	X4, X12
  1137  	AESENC	X5, X13
  1138  	AESENC	X6, X14
  1139  	AESENC	X7, X15
  1140  
  1141  	// scramble state
  1142  	AESENC	X8, X8
  1143  	AESENC	X9, X9
  1144  	AESENC	X10, X10
  1145  	AESENC	X11, X11
  1146  	AESENC	X12, X12
  1147  	AESENC	X13, X13
  1148  	AESENC	X14, X14
  1149  	AESENC	X15, X15
  1150  
  1151  	ADDQ	$128, AX
  1152  	DECQ	CX
  1153  	JNE	aesloop
  1154  
  1155  	// 2 more scrambles to finish
  1156  	AESENC	X8, X8
  1157  	AESENC	X9, X9
  1158  	AESENC	X10, X10
  1159  	AESENC	X11, X11
  1160  	AESENC	X12, X12
  1161  	AESENC	X13, X13
  1162  	AESENC	X14, X14
  1163  	AESENC	X15, X15
  1164  	AESENC	X8, X8
  1165  	AESENC	X9, X9
  1166  	AESENC	X10, X10
  1167  	AESENC	X11, X11
  1168  	AESENC	X12, X12
  1169  	AESENC	X13, X13
  1170  	AESENC	X14, X14
  1171  	AESENC	X15, X15
  1172  
  1173  	PXOR	X12, X8
  1174  	PXOR	X13, X9
  1175  	PXOR	X14, X10
  1176  	PXOR	X15, X11
  1177  	PXOR	X10, X8
  1178  	PXOR	X11, X9
  1179  	PXOR	X9, X8
  1180  	MOVQ	X8, (DX)
  1181  	RET
  1182  	
  1183  TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
  1184  	MOVQ	p+0(FP), AX	// ptr to data
  1185  	MOVQ	h+8(FP), X0	// seed
  1186  	PINSRD	$2, (AX), X0	// data
  1187  	AESENC	runtime·aeskeysched+0(SB), X0
  1188  	AESENC	runtime·aeskeysched+16(SB), X0
  1189  	AESENC	runtime·aeskeysched+32(SB), X0
  1190  	MOVQ	X0, ret+16(FP)
  1191  	RET
  1192  
  1193  TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
  1194  	MOVQ	p+0(FP), AX	// ptr to data
  1195  	MOVQ	h+8(FP), X0	// seed
  1196  	PINSRQ	$1, (AX), X0	// data
  1197  	AESENC	runtime·aeskeysched+0(SB), X0
  1198  	AESENC	runtime·aeskeysched+16(SB), X0
  1199  	AESENC	runtime·aeskeysched+32(SB), X0
  1200  	MOVQ	X0, ret+16(FP)
  1201  	RET
  1202  
  1203  // simple mask to get rid of data in the high part of the register.
  1204  DATA masks<>+0x00(SB)/8, $0x0000000000000000
  1205  DATA masks<>+0x08(SB)/8, $0x0000000000000000
  1206  DATA masks<>+0x10(SB)/8, $0x00000000000000ff
  1207  DATA masks<>+0x18(SB)/8, $0x0000000000000000
  1208  DATA masks<>+0x20(SB)/8, $0x000000000000ffff
  1209  DATA masks<>+0x28(SB)/8, $0x0000000000000000
  1210  DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
  1211  DATA masks<>+0x38(SB)/8, $0x0000000000000000
  1212  DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
  1213  DATA masks<>+0x48(SB)/8, $0x0000000000000000
  1214  DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
  1215  DATA masks<>+0x58(SB)/8, $0x0000000000000000
  1216  DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
  1217  DATA masks<>+0x68(SB)/8, $0x0000000000000000
  1218  DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
  1219  DATA masks<>+0x78(SB)/8, $0x0000000000000000
  1220  DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
  1221  DATA masks<>+0x88(SB)/8, $0x0000000000000000
  1222  DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
  1223  DATA masks<>+0x98(SB)/8, $0x00000000000000ff
  1224  DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
  1225  DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
  1226  DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
  1227  DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
  1228  DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
  1229  DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
  1230  DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
  1231  DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
  1232  DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
  1233  DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
  1234  DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
  1235  DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
  1236  GLOBL masks<>(SB),RODATA,$256
  1237  
  1238  TEXT ·checkASM(SB),NOSPLIT,$0-1
   1239  	// check that masks<>(SB) and shifts<>(SB) are 16-byte aligned
  1240  	MOVQ	$masks<>(SB), AX
  1241  	MOVQ	$shifts<>(SB), BX
  1242  	ORQ	BX, AX
  1243  	TESTQ	$15, AX
  1244  	SETEQ	ret+0(FP)
  1245  	RET
  1246  
  1247  // these are arguments to pshufb. They move data down from
  1248  // the high bytes of the register to the low bytes of the register.
  1249  // index is how many bytes to move.
  1250  DATA shifts<>+0x00(SB)/8, $0x0000000000000000
  1251  DATA shifts<>+0x08(SB)/8, $0x0000000000000000
  1252  DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
  1253  DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
  1254  DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
  1255  DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
  1256  DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
  1257  DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
  1258  DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
  1259  DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
  1260  DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
  1261  DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
  1262  DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
  1263  DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
  1264  DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
  1265  DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
  1266  DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
  1267  DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
  1268  DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
  1269  DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
  1270  DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
  1271  DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
  1272  DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
  1273  DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
  1274  DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
  1275  DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
  1276  DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
  1277  DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
  1278  DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
  1279  DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
  1280  DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
  1281  DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
  1282  GLOBL shifts<>(SB),RODATA,$256
  1283  
  1284  // memequal(p, q unsafe.Pointer, size uintptr) bool
  1285  TEXT runtime·memequal(SB),NOSPLIT,$0-25
  1286  	MOVQ	a+0(FP), SI
  1287  	MOVQ	b+8(FP), DI
  1288  	CMPQ	SI, DI
  1289  	JEQ	eq
  1290  	MOVQ	size+16(FP), BX
  1291  	LEAQ	ret+24(FP), AX
  1292  	JMP	runtime·memeqbody(SB)
  1293  eq:
  1294  	MOVB	$1, ret+24(FP)
  1295  	RET
  1296  
  1297  // memequal_varlen(a, b unsafe.Pointer) bool
  1298  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
  1299  	MOVQ	a+0(FP), SI
  1300  	MOVQ	b+8(FP), DI
  1301  	CMPQ	SI, DI
  1302  	JEQ	eq
  1303  	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
  1304  	LEAQ	ret+16(FP), AX
  1305  	JMP	runtime·memeqbody(SB)
  1306  eq:
  1307  	MOVB	$1, ret+16(FP)
  1308  	RET
  1309  
  1310  // eqstring tests whether two strings are equal.
  1311  // The compiler guarantees that strings passed
  1312  // to eqstring have equal length.
  1313  // See runtime_test.go:eqstring_generic for
  1314  // equivalent Go code.
  1315  TEXT runtime·eqstring(SB),NOSPLIT,$0-33
  1316  	MOVQ	s1str+0(FP), SI
  1317  	MOVQ	s2str+16(FP), DI
  1318  	CMPQ	SI, DI
  1319  	JEQ	eq
  1320  	MOVQ	s1len+8(FP), BX
  1321  	LEAQ	v+32(FP), AX
  1322  	JMP	runtime·memeqbody(SB)
  1323  eq:
  1324  	MOVB	$1, v+32(FP)
  1325  	RET
  1326  
  1327  // a in SI
  1328  // b in DI
  1329  // count in BX
  1330  // address of result byte in AX
  1331  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1332  	CMPQ	BX, $8
  1333  	JB	small
  1334  	CMPQ	BX, $64
  1335  	JB	bigloop
  1336  	CMPB    runtime·support_avx2(SB), $1
  1337  	JE	hugeloop_avx2
  1338  	
  1339  	// 64 bytes at a time using xmm registers
  1340  hugeloop:
  1341  	CMPQ	BX, $64
  1342  	JB	bigloop
  1343  	MOVOU	(SI), X0
  1344  	MOVOU	(DI), X1
  1345  	MOVOU	16(SI), X2
  1346  	MOVOU	16(DI), X3
  1347  	MOVOU	32(SI), X4
  1348  	MOVOU	32(DI), X5
  1349  	MOVOU	48(SI), X6
  1350  	MOVOU	48(DI), X7
  1351  	PCMPEQB	X1, X0
  1352  	PCMPEQB	X3, X2
  1353  	PCMPEQB	X5, X4
  1354  	PCMPEQB	X7, X6
  1355  	PAND	X2, X0
  1356  	PAND	X6, X4
  1357  	PAND	X4, X0
  1358  	PMOVMSKB X0, DX
  1359  	ADDQ	$64, SI
  1360  	ADDQ	$64, DI
  1361  	SUBQ	$64, BX
  1362  	CMPL	DX, $0xffff
  1363  	JEQ	hugeloop
  1364  	MOVB	$0, (AX)
  1365  	RET
  1366  
  1367  	// 64 bytes at a time using ymm registers
  1368  hugeloop_avx2:
  1369  	CMPQ	BX, $64
  1370  	JB	bigloop_avx2
  1371  	VMOVDQU	(SI), Y0
  1372  	VMOVDQU	(DI), Y1
  1373  	VMOVDQU	32(SI), Y2
  1374  	VMOVDQU	32(DI), Y3
  1375  	VPCMPEQB	Y1, Y0, Y4
  1376  	VPCMPEQB	Y2, Y3, Y5
  1377  	VPAND	Y4, Y5, Y6
  1378  	VPMOVMSKB Y6, DX
  1379  	ADDQ	$64, SI
  1380  	ADDQ	$64, DI
  1381  	SUBQ	$64, BX
  1382  	CMPL	DX, $0xffffffff
  1383  	JEQ	hugeloop_avx2
  1384  	VZEROUPPER
  1385  	MOVB	$0, (AX)
  1386  	RET
  1387  
  1388  bigloop_avx2:
  1389  	VZEROUPPER
  1390  
  1391  	// 8 bytes at a time using 64-bit register
  1392  bigloop:
  1393  	CMPQ	BX, $8
  1394  	JBE	leftover
  1395  	MOVQ	(SI), CX
  1396  	MOVQ	(DI), DX
  1397  	ADDQ	$8, SI
  1398  	ADDQ	$8, DI
  1399  	SUBQ	$8, BX
  1400  	CMPQ	CX, DX
  1401  	JEQ	bigloop
  1402  	MOVB	$0, (AX)
  1403  	RET
  1404  
  1405  	// remaining 0-8 bytes
  1406  leftover:
  1407  	MOVQ	-8(SI)(BX*1), CX
  1408  	MOVQ	-8(DI)(BX*1), DX
  1409  	CMPQ	CX, DX
  1410  	SETEQ	(AX)
  1411  	RET
  1412  
  1413  small:
  1414  	CMPQ	BX, $0
  1415  	JEQ	equal
  1416  
  1417  	LEAQ	0(BX*8), CX
  1418  	NEGQ	CX
  1419  
  1420  	CMPB	SI, $0xf8
  1421  	JA	si_high
  1422  
  1423  	// load at SI won't cross a page boundary.
  1424  	MOVQ	(SI), SI
  1425  	JMP	si_finish
  1426  si_high:
   1427  	// address ends in 11111xxx. Load the 8 bytes ending at the last byte we want, then shift into position.
  1428  	MOVQ	-8(SI)(BX*1), SI
  1429  	SHRQ	CX, SI
  1430  si_finish:
  1431  
  1432  	// same for DI.
  1433  	CMPB	DI, $0xf8
  1434  	JA	di_high
  1435  	MOVQ	(DI), DI
  1436  	JMP	di_finish
  1437  di_high:
  1438  	MOVQ	-8(DI)(BX*1), DI
  1439  	SHRQ	CX, DI
  1440  di_finish:
  1441  
  1442  	SUBQ	SI, DI
  1443  	SHLQ	CX, DI
  1444  equal:
  1445  	SETEQ	(AX)
  1446  	RET
  1447  
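// memeqbody is equivalent to the byte-by-byte comparison below (illustrative Go),
// vectorized 64 bytes at a time (AVX2 or SSE2), then 8 bytes at a time, with a
// final possibly-overlapping 8-byte load for the tail and a masked load for
// inputs shorter than 8 bytes:
//
//	func memeq(a, b []byte) bool { // len(a) == len(b)
//		for i := range a {
//			if a[i] != b[i] {
//				return false
//			}
//		}
//		return true
//	}
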
  1448  TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
  1449  	MOVQ	s1_base+0(FP), SI
  1450  	MOVQ	s1_len+8(FP), BX
  1451  	MOVQ	s2_base+16(FP), DI
  1452  	MOVQ	s2_len+24(FP), DX
  1453  	LEAQ	ret+32(FP), R9
  1454  	JMP	runtime·cmpbody(SB)
  1455  
  1456  TEXT bytes·Compare(SB),NOSPLIT,$0-56
  1457  	MOVQ	s1+0(FP), SI
  1458  	MOVQ	s1+8(FP), BX
  1459  	MOVQ	s2+24(FP), DI
  1460  	MOVQ	s2+32(FP), DX
  1461  	LEAQ	res+48(FP), R9
  1462  	JMP	runtime·cmpbody(SB)
  1463  
  1464  // input:
  1465  //   SI = a
  1466  //   DI = b
  1467  //   BX = alen
  1468  //   DX = blen
  1469  //   R9 = address of output word (stores -1/0/1 here)
  1470  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1471  	CMPQ	SI, DI
  1472  	JEQ	allsame
  1473  	CMPQ	BX, DX
  1474  	MOVQ	DX, R8
  1475  	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
  1476  	CMPQ	R8, $8
  1477  	JB	small
  1478  
  1479  	CMPQ	R8, $63
  1480  	JBE	loop
  1481  	CMPB    runtime·support_avx2(SB), $1
  1482  	JEQ     big_loop_avx2
  1483  	JMP	big_loop
  1484  loop:
  1485  	CMPQ	R8, $16
  1486  	JBE	_0through16
  1487  	MOVOU	(SI), X0
  1488  	MOVOU	(DI), X1
  1489  	PCMPEQB X0, X1
  1490  	PMOVMSKB X1, AX
  1491  	XORQ	$0xffff, AX	// convert EQ to NE
  1492  	JNE	diff16	// branch if at least one byte is not equal
  1493  	ADDQ	$16, SI
  1494  	ADDQ	$16, DI
  1495  	SUBQ	$16, R8
  1496  	JMP	loop
  1497  	
  1498  diff64:
  1499  	ADDQ	$48, SI
  1500  	ADDQ	$48, DI
  1501  	JMP	diff16
  1502  diff48:
  1503  	ADDQ	$32, SI
  1504  	ADDQ	$32, DI
  1505  	JMP	diff16
  1506  diff32:
  1507  	ADDQ	$16, SI
  1508  	ADDQ	$16, DI
  1509  	// AX = bit mask of differences
  1510  diff16:
  1511  	BSFQ	AX, BX	// index of first byte that differs
  1512  	XORQ	AX, AX
  1513  	MOVB	(SI)(BX*1), CX
  1514  	CMPB	CX, (DI)(BX*1)
  1515  	SETHI	AX
  1516  	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
  1517  	MOVQ	AX, (R9)
  1518  	RET
  1519  
  1520  	// 0 through 16 bytes left, alen>=8, blen>=8
  1521  _0through16:
  1522  	CMPQ	R8, $8
  1523  	JBE	_0through8
  1524  	MOVQ	(SI), AX
  1525  	MOVQ	(DI), CX
  1526  	CMPQ	AX, CX
  1527  	JNE	diff8
  1528  _0through8:
  1529  	MOVQ	-8(SI)(R8*1), AX
  1530  	MOVQ	-8(DI)(R8*1), CX
  1531  	CMPQ	AX, CX
  1532  	JEQ	allsame
  1533  
  1534  	// AX and CX contain parts of a and b that differ.
  1535  diff8:
  1536  	BSWAPQ	AX	// reverse order of bytes
  1537  	BSWAPQ	CX
  1538  	XORQ	AX, CX
  1539  	BSRQ	CX, CX	// index of highest bit difference
  1540  	SHRQ	CX, AX	// move a's bit to bottom
  1541  	ANDQ	$1, AX	// mask bit
  1542  	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
  1543  	MOVQ	AX, (R9)
  1544  	RET
  1545  
  1546  	// 0-7 bytes in common
  1547  small:
  1548  	LEAQ	(R8*8), CX	// bytes left -> bits left
   1549  	NEGQ	CX		// - bits left (== 64 - bits left mod 64)
  1550  	JEQ	allsame
  1551  
   1552  	// load bytes of a into high bytes of SI
  1553  	CMPB	SI, $0xf8
  1554  	JA	si_high
  1555  	MOVQ	(SI), SI
  1556  	JMP	si_finish
  1557  si_high:
  1558  	MOVQ	-8(SI)(R8*1), SI
  1559  	SHRQ	CX, SI
  1560  si_finish:
  1561  	SHLQ	CX, SI
  1562  
   1563  	// load bytes of b into high bytes of DI
  1564  	CMPB	DI, $0xf8
  1565  	JA	di_high
  1566  	MOVQ	(DI), DI
  1567  	JMP	di_finish
  1568  di_high:
  1569  	MOVQ	-8(DI)(R8*1), DI
  1570  	SHRQ	CX, DI
  1571  di_finish:
  1572  	SHLQ	CX, DI
  1573  
  1574  	BSWAPQ	SI	// reverse order of bytes
  1575  	BSWAPQ	DI
  1576  	XORQ	SI, DI	// find bit differences
  1577  	JEQ	allsame
  1578  	BSRQ	DI, CX	// index of highest bit difference
  1579  	SHRQ	CX, SI	// move a's bit to bottom
  1580  	ANDQ	$1, SI	// mask bit
  1581  	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
  1582  	MOVQ	AX, (R9)
  1583  	RET
  1584  
  1585  allsame:
  1586  	XORQ	AX, AX
  1587  	XORQ	CX, CX
  1588  	CMPQ	BX, DX
  1589  	SETGT	AX	// 1 if alen > blen
  1590  	SETEQ	CX	// 1 if alen == blen
  1591  	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
  1592  	MOVQ	AX, (R9)
  1593  	RET
  1594  
  1595  	// this works for >= 64 bytes of data.
  1596  big_loop:
  1597  	MOVOU	(SI), X0
  1598  	MOVOU	(DI), X1
  1599  	PCMPEQB X0, X1
  1600  	PMOVMSKB X1, AX
  1601  	XORQ	$0xffff, AX
  1602  	JNE	diff16
  1603  
  1604  	MOVOU	16(SI), X0
  1605  	MOVOU	16(DI), X1
  1606  	PCMPEQB X0, X1
  1607  	PMOVMSKB X1, AX
  1608  	XORQ	$0xffff, AX
  1609  	JNE	diff32
  1610  
  1611  	MOVOU	32(SI), X0
  1612  	MOVOU	32(DI), X1
  1613  	PCMPEQB X0, X1
  1614  	PMOVMSKB X1, AX
  1615  	XORQ	$0xffff, AX
  1616  	JNE	diff48
  1617  
  1618  	MOVOU	48(SI), X0
  1619  	MOVOU	48(DI), X1
  1620  	PCMPEQB X0, X1
  1621  	PMOVMSKB X1, AX
  1622  	XORQ	$0xffff, AX
  1623  	JNE	diff64
  1624  
  1625  	ADDQ	$64, SI
  1626  	ADDQ	$64, DI
  1627  	SUBQ	$64, R8
  1628  	CMPQ	R8, $64
  1629  	JBE	loop
  1630  	JMP	big_loop
  1631  
   1632  	// Compare 64 bytes per loop iteration.
  1633  	// Loop is unrolled and uses AVX2.
  1634  big_loop_avx2:
  1635  	VMOVDQU	(SI), Y2
  1636  	VMOVDQU	(DI), Y3
  1637  	VMOVDQU	32(SI), Y4
  1638  	VMOVDQU	32(DI), Y5
  1639  	VPCMPEQB Y2, Y3, Y0
  1640  	VPMOVMSKB Y0, AX
  1641  	XORL	$0xffffffff, AX
  1642  	JNE	diff32_avx2
  1643  	VPCMPEQB Y4, Y5, Y6
  1644  	VPMOVMSKB Y6, AX
  1645  	XORL	$0xffffffff, AX
  1646  	JNE	diff64_avx2
  1647  
  1648  	ADDQ	$64, SI
  1649  	ADDQ	$64, DI
  1650  	SUBQ	$64, R8
  1651  	CMPQ	R8, $64
  1652  	JB	big_loop_avx2_exit
  1653  	JMP	big_loop_avx2
  1654  
  1655  	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
  1656  diff32_avx2:
  1657  	VZEROUPPER
  1658  	JMP diff16
  1659  
  1660  	// Same as diff32_avx2, but for last 32 bytes.
  1661  diff64_avx2:
  1662  	VZEROUPPER
  1663  	JMP diff48
  1664  
   1665  	// For a remainder of fewer than 64 bytes, jump to the normal loop.
  1666  big_loop_avx2_exit:
  1667  	VZEROUPPER
  1668  	JMP loop
  1669  
  1670  
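// cmpbody computes the usual three-way comparison (illustrative Go); the loops
// above vectorize this 64 and 16 bytes at a time and use BSF/BSR to locate the
// first differing byte within a block:
//
//	func cmp(a, b []byte) int {
//		n := len(a)
//		if len(b) < n {
//			n = len(b)
//		}
//		for i := 0; i < n; i++ {
//			if a[i] != b[i] {
//				if a[i] < b[i] {
//					return -1
//				}
//				return +1
//			}
//		}
//		switch {
//		case len(a) < len(b):
//			return -1
//		case len(a) > len(b):
//			return +1
//		}
//		return 0
//	}
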
  1671  // TODO: Also use this in bytes.Index
  1672  TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
  1673  	MOVQ s+0(FP), DI
  1674  	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
  1675  	MOVQ s_len+8(FP), DX
  1676  	MOVQ c+16(FP), BP
  1677  	MOVQ c_len+24(FP), AX
  1678  	CMPQ AX, DX
  1679  	JA fail
  1680  	CMPQ DX, $16
  1681  	JAE sse42
  1682  no_sse42:
  1683  	CMPQ AX, $2
  1684  	JA   _3_or_more
  1685  	MOVW (BP), BP
  1686  	LEAQ -1(DI)(DX*1), DX
  1687  loop2:
  1688  	MOVW (DI), SI
  1689  	CMPW SI,BP
  1690  	JZ success
  1691  	ADDQ $1,DI
  1692  	CMPQ DI,DX
  1693  	JB loop2
  1694  	JMP fail
  1695  _3_or_more:
  1696  	CMPQ AX, $3
  1697  	JA   _4_or_more
  1698  	MOVW 1(BP), BX
  1699  	MOVW (BP), BP
  1700  	LEAQ -2(DI)(DX*1), DX
  1701  loop3:
  1702  	MOVW (DI), SI
  1703  	CMPW SI,BP
  1704  	JZ   partial_success3
  1705  	ADDQ $1,DI
  1706  	CMPQ DI,DX
  1707  	JB loop3
  1708  	JMP fail
  1709  partial_success3:
  1710  	MOVW 1(DI), SI
  1711  	CMPW SI,BX
  1712  	JZ success
  1713  	ADDQ $1,DI
  1714  	CMPQ DI,DX
  1715  	JB loop3
  1716  	JMP fail
  1717  _4_or_more:
  1718  	CMPQ AX, $4
  1719  	JA   _5_or_more
  1720  	MOVL (BP), BP
  1721  	LEAQ -3(DI)(DX*1), DX
  1722  loop4:
  1723  	MOVL (DI), SI
  1724  	CMPL SI,BP
  1725  	JZ   success
  1726  	ADDQ $1,DI
  1727  	CMPQ DI,DX
  1728  	JB loop4
  1729  	JMP fail
  1730  _5_or_more:
  1731  	CMPQ AX, $7
  1732  	JA   _8_or_more
  1733  	LEAQ 1(DI)(DX*1), DX
  1734  	SUBQ AX, DX
  1735  	MOVL -4(BP)(AX*1), BX
  1736  	MOVL (BP), BP
  1737  loop5to7:
  1738  	MOVL (DI), SI
  1739  	CMPL SI,BP
  1740  	JZ   partial_success5to7
  1741  	ADDQ $1,DI
  1742  	CMPQ DI,DX
  1743  	JB loop5to7
  1744  	JMP fail
  1745  partial_success5to7:
  1746  	MOVL -4(AX)(DI*1), SI
  1747  	CMPL SI,BX
  1748  	JZ success
  1749  	ADDQ $1,DI
  1750  	CMPQ DI,DX
  1751  	JB loop5to7
  1752  	JMP fail
  1753  _8_or_more:
  1754  	CMPQ AX, $8
  1755  	JA   _9_or_more
  1756  	MOVQ (BP), BP
  1757  	LEAQ -7(DI)(DX*1), DX
  1758  loop8:
  1759  	MOVQ (DI), SI
  1760  	CMPQ SI,BP
  1761  	JZ   success
  1762  	ADDQ $1,DI
  1763  	CMPQ DI,DX
  1764  	JB loop8
  1765  	JMP fail
  1766  _9_or_more:
  1767  	CMPQ AX, $16
  1768  	JA   _16_or_more
  1769  	LEAQ 1(DI)(DX*1), DX
  1770  	SUBQ AX, DX
  1771  	MOVQ -8(BP)(AX*1), BX
  1772  	MOVQ (BP), BP
  1773  loop9to15:
  1774  	MOVQ (DI), SI
  1775  	CMPQ SI,BP
  1776  	JZ   partial_success9to15
  1777  	ADDQ $1,DI
  1778  	CMPQ DI,DX
  1779  	JB loop9to15
  1780  	JMP fail
  1781  partial_success9to15:
  1782  	MOVQ -8(AX)(DI*1), SI
  1783  	CMPQ SI,BX
  1784  	JZ success
  1785  	ADDQ $1,DI
  1786  	CMPQ DI,DX
  1787  	JB loop9to15
  1788  	JMP fail
  1789  _16_or_more:
  1790  	CMPQ AX, $16
  1791  	JA   _17_to_31
  1792  	MOVOU (BP), X1
  1793  	LEAQ -15(DI)(DX*1), DX
  1794  loop16:
  1795  	MOVOU (DI), X2
  1796  	PCMPEQB X1, X2
  1797  	PMOVMSKB X2, SI
  1798  	CMPQ  SI, $0xffff
  1799  	JE   success
  1800  	ADDQ $1,DI
  1801  	CMPQ DI,DX
  1802  	JB loop16
  1803  	JMP fail
  1804  _17_to_31:
  1805  	LEAQ 1(DI)(DX*1), DX
  1806  	SUBQ AX, DX
  1807  	MOVOU -16(BP)(AX*1), X0
  1808  	MOVOU (BP), X1
  1809  loop17to31:
  1810  	MOVOU (DI), X2
  1811  	PCMPEQB X1,X2
  1812  	PMOVMSKB X2, SI
  1813  	CMPQ  SI, $0xffff
  1814  	JE   partial_success17to31
  1815  	ADDQ $1,DI
  1816  	CMPQ DI,DX
  1817  	JB loop17to31
  1818  	JMP fail
  1819  partial_success17to31:
  1820  	MOVOU -16(AX)(DI*1), X3
  1821  	PCMPEQB X0, X3
  1822  	PMOVMSKB X3, SI
  1823  	CMPQ  SI, $0xffff
  1824  	JE success
  1825  	ADDQ $1,DI
  1826  	CMPQ DI,DX
  1827  	JB loop17to31
  1828  fail:
  1829  	MOVQ $-1, ret+32(FP)
  1830  	RET
  1831  sse42:
  1832  	MOVL runtime·cpuid_ecx(SB), CX
  1833  	ANDL $0x100000, CX
  1834  	JZ no_sse42
  1835  	CMPQ AX, $12
  1836  	// PCMPESTRI is slower than normal compare,
  1837  	// so using it makes sense only if we advance 4+ bytes per compare
  1838  	// This value was determined experimentally and is the ~same
  1839  	// on Nehalem (first with SSE42) and Haswell.
  1840  	JAE _9_or_more
  1841  	LEAQ 16(BP), SI
  1842  	TESTW $0xff0, SI
  1843  	JEQ no_sse42
  1844  	MOVOU (BP), X1
  1845  	LEAQ -15(DI)(DX*1), SI
  1846  	MOVQ $16, R9
  1847  	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
  1848  loop_sse42:
  1849  	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
  1850  	// for equality (bits 2,3 are 11)
  1851  	// result is not masked or inverted (bits 4,5 are 00)
  1852  	// and corresponds to first matching byte (bit 6 is 0)
  1853  	PCMPESTRI $0x0c, (DI), X1
  1854  	// CX == 16 means no match,
  1855  	// CX > R9 means partial match at the end of the string,
  1856  	// otherwise sep is at offset CX from X1 start
  1857  	CMPQ CX, R9
  1858  	JBE sse42_success
  1859  	ADDQ R9, DI
  1860  	CMPQ DI, SI
  1861  	JB loop_sse42
  1862  	PCMPESTRI $0x0c, -1(SI), X1
  1863  	CMPQ CX, R9
  1864  	JA fail
  1865  	LEAQ -1(SI), DI
  1866  sse42_success:
  1867  	ADDQ CX, DI
  1868  success:
  1869  	SUBQ s+0(FP), DI
  1870  	MOVQ DI, ret+32(FP)
  1871  	RET
  1872  
  1873  
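// The specialized loops above all implement the same brute-force search
// (illustrative Go), with each comparison done as a single 2-, 4-, 8- or
// 16-byte load sized to len(c), plus an SSE4.2 PCMPESTRI path for short
// separators in long strings:
//
//	func indexShortStr(s, c string) int {
//		for i := 0; i+len(c) <= len(s); i++ {
//			if s[i:i+len(c)] == c {
//				return i
//			}
//		}
//		return -1
//	}
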
  1874  TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
  1875  	MOVQ s+0(FP), SI
  1876  	MOVQ s_len+8(FP), BX
  1877  	MOVB c+24(FP), AL
  1878  	LEAQ ret+32(FP), R8
  1879  	JMP  runtime·indexbytebody(SB)
  1880  
  1881  TEXT strings·IndexByte(SB),NOSPLIT,$0-32
  1882  	MOVQ s+0(FP), SI
  1883  	MOVQ s_len+8(FP), BX
  1884  	MOVB c+16(FP), AL
  1885  	LEAQ ret+24(FP), R8
  1886  	JMP  runtime·indexbytebody(SB)
  1887  
  1888  // input:
  1889  //   SI: data
  1890  //   BX: data len
  1891  //   AL: byte sought
  1892  //   R8: address to put result
  1893  TEXT runtime·indexbytebody(SB),NOSPLIT,$0
  1894  	// Shuffle X0 around so that each byte contains
  1895  	// the character we're looking for.
  1896  	MOVD AX, X0
  1897  	PUNPCKLBW X0, X0
  1898  	PUNPCKLBW X0, X0
  1899  	PSHUFL $0, X0, X0
  1900  	
  1901  	CMPQ BX, $16
  1902  	JLT small
  1903  
  1904  	MOVQ SI, DI
  1905  
  1906  	CMPQ BX, $32
  1907  	JA avx2
  1908  sse:
  1909  	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
  1910  	JMP	sseloopentry
  1911  	
  1912  sseloop:
  1913  	// Move the next 16-byte chunk of the data into X1.
  1914  	MOVOU	(DI), X1
  1915  	// Compare bytes in X0 to X1.
  1916  	PCMPEQB	X0, X1
  1917  	// Take the top bit of each byte in X1 and put the result in DX.
  1918  	PMOVMSKB X1, DX
  1919  	// Find first set bit, if any.
  1920  	BSFL	DX, DX
  1921  	JNZ	ssesuccess
  1922  	// Advance to next block.
  1923  	ADDQ	$16, DI
  1924  sseloopentry:
  1925  	CMPQ	DI, AX
  1926  	JB	sseloop
  1927  
  1928  	// Search the last 16-byte chunk. This chunk may overlap with the
  1929  	// chunks we've already searched, but that's ok.
  1930  	MOVQ	AX, DI
  1931  	MOVOU	(AX), X1
  1932  	PCMPEQB	X0, X1
  1933  	PMOVMSKB X1, DX
  1934  	BSFL	DX, DX
  1935  	JNZ	ssesuccess
  1936  
  1937  failure:
  1938  	MOVQ $-1, (R8)
  1939  	RET
  1940  
  1941  // We've found a chunk containing the byte.
  1942  // The chunk was loaded from DI.
  1943  // The index of the matching byte in the chunk is DX.
  1944  // The start of the data is SI.
  1945  ssesuccess:
  1946  	SUBQ SI, DI	// Compute offset of chunk within data.
  1947  	ADDQ DX, DI	// Add offset of byte within chunk.
  1948  	MOVQ DI, (R8)
  1949  	RET
  1950  
   1951  // handle lengths < 16
  1952  small:
  1953  	TESTQ	BX, BX
  1954  	JEQ	failure
  1955  
  1956  	// Check if we'll load across a page boundary.
  1957  	LEAQ	16(SI), AX
  1958  	TESTW	$0xff0, AX
  1959  	JEQ	endofpage
  1960  
  1961  	MOVOU	(SI), X1 // Load data
  1962  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  1963  	PMOVMSKB X1, DX	// Move result bits to integer register.
  1964  	BSFL	DX, DX	// Find first set bit.
  1965  	JZ	failure	// No set bit, failure.
  1966  	CMPL	DX, BX
  1967  	JAE	failure	// Match is past end of data.
  1968  	MOVQ	DX, (R8)
  1969  	RET
  1970  
  1971  endofpage:
  1972  	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
  1973  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  1974  	PMOVMSKB X1, DX	// Move result bits to integer register.
  1975  	MOVL	BX, CX
  1976  	SHLL	CX, DX
  1977  	SHRL	$16, DX	// Shift desired bits down to bottom of register.
  1978  	BSFL	DX, DX	// Find first set bit.
  1979  	JZ	failure	// No set bit, failure.
  1980  	MOVQ	DX, (R8)
  1981  	RET
  1982  
  1983  avx2:
  1984  	CMPB   runtime·support_avx2(SB), $1
  1985  	JNE sse
  1986  	MOVD AX, X0
  1987  	LEAQ -32(SI)(BX*1), R11
  1988  	VPBROADCASTB  X0, Y1
  1989  avx2_loop:
  1990  	VMOVDQU (DI), Y2
  1991  	VPCMPEQB Y1, Y2, Y3
  1992  	VPTEST Y3, Y3
  1993  	JNZ avx2success
  1994  	ADDQ $32, DI
  1995  	CMPQ DI, R11
  1996  	JLT avx2_loop
  1997  	MOVQ R11, DI
  1998  	VMOVDQU (DI), Y2
  1999  	VPCMPEQB Y1, Y2, Y3
  2000  	VPTEST Y3, Y3
  2001  	JNZ avx2success
  2002  	VZEROUPPER
  2003  	MOVQ $-1, (R8)
  2004  	RET
  2005  
  2006  avx2success:
  2007  	VPMOVMSKB Y3, DX
  2008  	BSFL DX, DX
  2009  	SUBQ SI, DI
  2010  	ADDQ DI, DX
  2011  	MOVQ DX, (R8)
  2012  	VZEROUPPER
  2013  	RET
  2014  
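// indexbytebody implements the scan below (illustrative Go): 32 bytes per step
// with AVX2 when available, 16 bytes per step with SSE2 otherwise, plus special
// handling for short inputs that might touch a page boundary:
//
//	func indexByte(s []byte, c byte) int {
//		for i, b := range s {
//			if b == c {
//				return i
//			}
//		}
//		return -1
//	}
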
  2015  TEXT bytes·Equal(SB),NOSPLIT,$0-49
  2016  	MOVQ	a_len+8(FP), BX
  2017  	MOVQ	b_len+32(FP), CX
  2018  	CMPQ	BX, CX
  2019  	JNE	eqret
  2020  	MOVQ	a+0(FP), SI
  2021  	MOVQ	b+24(FP), DI
  2022  	LEAQ	ret+48(FP), AX
  2023  	JMP	runtime·memeqbody(SB)
  2024  eqret:
  2025  	MOVB	$0, ret+48(FP)
  2026  	RET
  2027  
  2028  TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
  2029  	get_tls(CX)
  2030  	MOVQ	g(CX), AX
  2031  	MOVQ	g_m(AX), AX
  2032  	MOVL	m_fastrand(AX), DX
  2033  	ADDL	DX, DX
  2034  	MOVL	DX, BX
  2035  	XORL	$0x88888eef, DX
  2036  	CMOVLMI	BX, DX
  2037  	MOVL	DX, m_fastrand(AX)
  2038  	MOVL	DX, ret+0(FP)
  2039  	RET
  2040  
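// The generator above is a 32-bit linear feedback shift register; in Go terms
// (illustrative):
//
//	func fastrand1() uint32 {
//		x := getg().m.fastrand
//		x += x // shift left one bit
//		if x&0x80000000 != 0 {
//			x ^= 0x88888eef // feedback polynomial
//		}
//		getg().m.fastrand = x
//		return x
//	}
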
  2041  TEXT runtime·return0(SB), NOSPLIT, $0
  2042  	MOVL	$0, AX
  2043  	RET
  2044  
  2045  
  2046  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  2047  // Must obey the gcc calling convention.
  2048  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  2049  	get_tls(CX)
  2050  	MOVQ	g(CX), AX
  2051  	MOVQ	g_m(AX), AX
  2052  	MOVQ	m_curg(AX), AX
  2053  	MOVQ	(g_stack+stack_hi)(AX), AX
  2054  	RET
  2055  
  2056  // The top-most function running on a goroutine
  2057  // returns to goexit+PCQuantum.
  2058  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  2059  	BYTE	$0x90	// NOP
  2060  	CALL	runtime·goexit1(SB)	// does not return
  2061  	// traceback from goexit1 must hit code range of goexit
  2062  	BYTE	$0x90	// NOP
  2063  
  2064  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
  2065  	MOVQ	addr+0(FP), AX
  2066  	PREFETCHT0	(AX)
  2067  	RET
  2068  
  2069  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
  2070  	MOVQ	addr+0(FP), AX
  2071  	PREFETCHT1	(AX)
  2072  	RET
  2073  
  2074  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
  2075  	MOVQ	addr+0(FP), AX
  2076  	PREFETCHT2	(AX)
  2077  	RET
  2078  
  2079  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
  2080  	MOVQ	addr+0(FP), AX
  2081  	PREFETCHNTA	(AX)
  2082  	RET
  2083  
  2084  // This is called from .init_array and follows the platform, not Go, ABI.
  2085  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  2086  	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
  2087  	MOVQ	runtime·lastmoduledatap(SB), AX
  2088  	MOVQ	DI, moduledata_next(AX)
  2089  	MOVQ	DI, runtime·lastmoduledatap(SB)
  2090  	POPQ	R15
  2091  	RET