github.com/fjballest/golang@v0.0.0-20151209143359-e4c5fe594ca8/src/runtime/asm_amd64.s

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
    11  	// copy arguments forward on an even stack
    12  	MOVQ	DI, AX		// argc
    13  	MOVQ	SI, BX		// argv
    14  	SUBQ	$(4*8+7), SP		// 2args 2auto
    15  	ANDQ	$~15, SP
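        	// Round SP down to a 16-byte boundary (the "even stack" mentioned above).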
    16  	MOVQ	AX, 16(SP)
    17  	MOVQ	BX, 24(SP)
    18  	
    19  	// create istack out of the given (operating system) stack.
    20  	// _cgo_init may update stackguard.
    21  	MOVQ	$runtime·g0(SB), DI
    22  	LEAQ	(-64*1024+104)(SP), BX
    23  	MOVQ	BX, g_stackguard0(DI)
    24  	MOVQ	BX, g_stackguard1(DI)
    25  	MOVQ	BX, (g_stack+stack_lo)(DI)
    26  	MOVQ	SP, (g_stack+stack_hi)(DI)
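        	// g0's stack is thus roughly the 64 kB of OS stack below the entry SP.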
    27  
    28  	// find out information about the processor we're on
    29  	MOVQ	$0, AX
    30  	CPUID
    31  	CMPQ	AX, $0
    32  	JE	nocpuinfo
    33  
    34  	// Figure out how to serialize RDTSC.
    35  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    36  	// Don't know about the rest, so let's do MFENCE.
    37  	CMPL	BX, $0x756E6547  // "Genu"
    38  	JNE	notintel
    39  	CMPL	DX, $0x49656E69  // "ineI"
    40  	JNE	notintel
    41  	CMPL	CX, $0x6C65746E  // "ntel"
    42  	JNE	notintel
    43  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
    44  notintel:
    45  	// Do nothing.
    46  
    47  	MOVQ	$1, AX
    48  	CPUID
    49  	MOVL	CX, runtime·cpuid_ecx(SB)
    50  	MOVL	DX, runtime·cpuid_edx(SB)
    51  	// Detect AVX and AVX2 as described in section 14.7.1 (Detection of AVX2) of [1]
    52  	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
    53  	// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
    54  	ANDL    $0x18000000, CX // check for OSXSAVE and AVX bits
    55  	CMPL    CX, $0x18000000
    56  	JNE     noavx
    57  	MOVL    $0, CX
    58  	// For XGETBV, OSXSAVE bit is required and sufficient
    59  	BYTE $0x0F; BYTE $0x01; BYTE $0xD0	// XGETBV
    60  	ANDL    $6, AX
    61  	CMPL    AX, $6 // Check for OS support of YMM registers
    62  	JNE     noavx
    63  	MOVB    $1, runtime·support_avx(SB)
    64  	MOVL    $7, AX
    65  	MOVL    $0, CX
    66  	CPUID
    67  	ANDL    $0x20, BX // check for AVX2 bit
    68  	CMPL    BX, $0x20
    69  	JNE     noavx2
    70  	MOVB    $1, runtime·support_avx2(SB)
    71  	JMP     nocpuinfo
    72  noavx:
    73  	MOVB    $0, runtime·support_avx(SB)
    74  noavx2:
    75  	MOVB    $0, runtime·support_avx2(SB)
    76  nocpuinfo:	
    77  	
    78  	// if there is an _cgo_init, call it.
    79  	MOVQ	_cgo_init(SB), AX
    80  	TESTQ	AX, AX
    81  	JZ	needtls
    82  	// g0 already in DI
    83  	MOVQ	DI, CX	// Win64 uses CX for first parameter
    84  	MOVQ	$setg_gcc<>(SB), SI
    85  	CALL	AX
    86  
    87  	// update stackguard after _cgo_init
    88  	MOVQ	$runtime·g0(SB), CX
    89  	MOVQ	(g_stack+stack_lo)(CX), AX
    90  	ADDQ	$const__StackGuard, AX
    91  	MOVQ	AX, g_stackguard0(CX)
    92  	MOVQ	AX, g_stackguard1(CX)
    93  
    94  #ifndef GOOS_windows
    95  	JMP ok
    96  #endif
    97  needtls:
    98  #ifdef GOOS_plan9
    99  	// skip TLS setup on Plan 9
   100  	JMP ok
   101  #endif
   102  #ifdef GOOS_solaris
   103  	// skip TLS setup on Solaris
   104  	JMP ok
   105  #endif
   106  
   107  	LEAQ	runtime·m0+m_tls(SB), DI
   108  	CALL	runtime·settls(SB)
   109  
   110  	// store through it, to make sure it works
   111  	get_tls(BX)
   112  	MOVQ	$0x123, g(BX)
   113  	MOVQ	runtime·m0+m_tls(SB), AX
   114  	CMPQ	AX, $0x123
   115  	JEQ 2(PC)
   116  	MOVL	AX, 0	// abort
   117  ok:
   118  	// set the per-goroutine and per-mach "registers"
   119  	get_tls(BX)
   120  	LEAQ	runtime·g0(SB), CX
   121  	MOVQ	CX, g(BX)
   122  	LEAQ	runtime·m0(SB), AX
   123  
   124  	// save m->g0 = g0
   125  	MOVQ	CX, m_g0(AX)
   126  	// save m0 to g0->m
   127  	MOVQ	AX, g_m(CX)
   128  
   129  	CLD				// convention is D is always left cleared
   130  	CALL	runtime·check(SB)
   131  
   132  	MOVL	16(SP), AX		// copy argc
   133  	MOVL	AX, 0(SP)
   134  	MOVQ	24(SP), AX		// copy argv
   135  	MOVQ	AX, 8(SP)
   136  	CALL	runtime·args(SB)
   137  	CALL	runtime·osinit(SB)
   138  	CALL	runtime·schedinit(SB)
   139  
   140  	// create a new goroutine to start program
   141  	MOVQ	$runtime·mainPC(SB), AX		// entry
   142  	PUSHQ	AX
   143  	PUSHQ	$0			// arg size
   144  	CALL	runtime·newproc(SB)
   145  	POPQ	AX
   146  	POPQ	AX
   147  
   148  	// start this M
   149  	CALL	runtime·mstart(SB)
   150  
   151  	MOVL	$0xf1, 0xf1  // crash
   152  	RET
   153  
   154  DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
   155  GLOBL	runtime·mainPC(SB),RODATA,$8
   156  
   157  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
   158  	BYTE	$0xcc
   159  	RET
   160  
   161  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   162  	// No per-thread init.
   163  	RET
   164  
   165  /*
   166   *  go-routine
   167   */
   168  
   169  // void gosave(Gobuf*)
   170  // save state in Gobuf; setjmp
   171  TEXT runtime·gosave(SB), NOSPLIT, $0-8
   172  	MOVQ	buf+0(FP), AX		// gobuf
   173  	LEAQ	buf+0(FP), BX		// caller's SP
   174  	MOVQ	BX, gobuf_sp(AX)
   175  	MOVQ	0(SP), BX		// caller's PC
   176  	MOVQ	BX, gobuf_pc(AX)
   177  	MOVQ	$0, gobuf_ret(AX)
   178  	MOVQ	$0, gobuf_ctxt(AX)
   179  	MOVQ	BP, gobuf_bp(AX)
   180  	get_tls(CX)
   181  	MOVQ	g(CX), BX
   182  	MOVQ	BX, gobuf_g(AX)
   183  	RET
   184  
   185  // void gogo(Gobuf*)
   186  // restore state from Gobuf; longjmp
   187  TEXT runtime·gogo(SB), NOSPLIT, $0-8
   188  	MOVQ	buf+0(FP), BX		// gobuf
   189  	MOVQ	gobuf_g(BX), DX
   190  	MOVQ	0(DX), CX		// make sure g != nil
   191  	get_tls(CX)
   192  	MOVQ	DX, g(CX)
   193  	MOVQ	gobuf_sp(BX), SP	// restore SP
   194  	MOVQ	gobuf_ret(BX), AX
   195  	MOVQ	gobuf_ctxt(BX), DX
   196  	MOVQ	gobuf_bp(BX), BP
   197  	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   198  	MOVQ	$0, gobuf_ret(BX)
   199  	MOVQ	$0, gobuf_ctxt(BX)
   200  	MOVQ	$0, gobuf_bp(BX)
   201  	MOVQ	gobuf_pc(BX), BX
   202  	JMP	BX
   203  
   204  // func mcall(fn func(*g))
   205  // Switch to m->g0's stack, call fn(g).
   206  // Fn must never return.  It should gogo(&g->sched)
   207  // to keep running g.
   208  TEXT runtime·mcall(SB), NOSPLIT, $0-8
   209  	MOVQ	fn+0(FP), DI
   210  	
   211  	get_tls(CX)
   212  	MOVQ	g(CX), AX	// save state in g->sched
   213  	MOVQ	0(SP), BX	// caller's PC
   214  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   215  	LEAQ	fn+0(FP), BX	// caller's SP
   216  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   217  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   218  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   219  
   220  	// switch to m->g0 & its stack, call fn
   221  	MOVQ	g(CX), BX
   222  	MOVQ	g_m(BX), BX
   223  	MOVQ	m_g0(BX), SI
   224  	CMPQ	SI, AX	// if g == m->g0 call badmcall
   225  	JNE	3(PC)
   226  	MOVQ	$runtime·badmcall(SB), AX
   227  	JMP	AX
   228  	MOVQ	SI, g(CX)	// g = m->g0
   229  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   230  	PUSHQ	AX
   231  	MOVQ	DI, DX
   232  	MOVQ	0(DI), DI
   233  	CALL	DI
   234  	POPQ	AX
   235  	MOVQ	$runtime·badmcall2(SB), AX
   236  	JMP	AX
   237  	RET
   238  
   239  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   240  // of the G stack.  We need to distinguish the routine that
   241  // lives at the bottom of the G stack from the one that lives
   242  // at the top of the system stack because the one at the top of
   243  // the system stack terminates the stack walk (see topofstack()).
   244  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   245  	RET
   246  
   247  // func systemstack(fn func())
   248  TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   249  	MOVQ	fn+0(FP), DI	// DI = fn
   250  	get_tls(CX)
   251  	MOVQ	g(CX), AX	// AX = g
   252  	MOVQ	g_m(AX), BX	// BX = m
   253  
   254  	MOVQ	m_gsignal(BX), DX	// DX = gsignal
   255  	CMPQ	AX, DX
   256  	JEQ	noswitch
   257  
   258  	MOVQ	m_g0(BX), DX	// DX = g0
   259  	CMPQ	AX, DX
   260  	JEQ	noswitch
   261  
   262  	MOVQ	m_curg(BX), R8
   263  	CMPQ	AX, R8
   264  	JEQ	switch
   265  	
   266  	// Bad: g is not gsignal, not g0, not curg. What is it?
   267  	MOVQ	$runtime·badsystemstack(SB), AX
   268  	CALL	AX
   269  
   270  switch:
   271  	// save our state in g->sched.  Pretend to
   272  	// be systemstack_switch if the G stack is scanned.
   273  	MOVQ	$runtime·systemstack_switch(SB), SI
   274  	MOVQ	SI, (g_sched+gobuf_pc)(AX)
   275  	MOVQ	SP, (g_sched+gobuf_sp)(AX)
   276  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   277  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   278  
   279  	// switch to g0
   280  	MOVQ	DX, g(CX)
   281  	MOVQ	(g_sched+gobuf_sp)(DX), BX
   282  	// make it look like mstart called systemstack on g0, to stop traceback
   283  	SUBQ	$8, BX
   284  	MOVQ	$runtime·mstart(SB), DX
   285  	MOVQ	DX, 0(BX)
   286  	MOVQ	BX, SP
   287  
   288  	// call target function
   289  	MOVQ	DI, DX
   290  	MOVQ	0(DI), DI
   291  	CALL	DI
   292  
   293  	// switch back to g
   294  	get_tls(CX)
   295  	MOVQ	g(CX), AX
   296  	MOVQ	g_m(AX), BX
   297  	MOVQ	m_curg(BX), AX
   298  	MOVQ	AX, g(CX)
   299  	MOVQ	(g_sched+gobuf_sp)(AX), SP
   300  	MOVQ	$0, (g_sched+gobuf_sp)(AX)
   301  	RET
   302  
   303  noswitch:
   304  	// already on m stack, just call directly
   305  	MOVQ	DI, DX
   306  	MOVQ	0(DI), DI
   307  	CALL	DI
   308  	RET
   309  
   310  /*
   311   * support for morestack
   312   */
   313  
   314  // Called during function prolog when more stack is needed.
   315  //
   316  // The traceback routines see morestack on a g0 as being
   317  // the top of a stack (for example, morestack calling newstack
   318  // calling the scheduler calling newm calling gc), so we must
   319  // record an argument size. For that purpose, it has no arguments.
   320  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   321  	// Cannot grow scheduler stack (m->g0).
   322  	get_tls(CX)
   323  	MOVQ	g(CX), BX
   324  	MOVQ	g_m(BX), BX
   325  	MOVQ	m_g0(BX), SI
   326  	CMPQ	g(CX), SI
   327  	JNE	2(PC)
   328  	INT	$3
   329  
   330  	// Cannot grow signal stack (m->gsignal).
   331  	MOVQ	m_gsignal(BX), SI
   332  	CMPQ	g(CX), SI
   333  	JNE	2(PC)
   334  	INT	$3
   335  
   336  	// Called from f.
   337  	// Set m->morebuf to f's caller.
   338  	MOVQ	8(SP), AX	// f's caller's PC
   339  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   340  	LEAQ	16(SP), AX	// f's caller's SP
   341  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   342  	get_tls(CX)
   343  	MOVQ	g(CX), SI
   344  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   345  
   346  	// Set g->sched to context in f.
   347  	MOVQ	0(SP), AX // f's PC
   348  	MOVQ	AX, (g_sched+gobuf_pc)(SI)
   349  	MOVQ	SI, (g_sched+gobuf_g)(SI)
   350  	LEAQ	8(SP), AX // f's SP
   351  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   352  	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
   353  	MOVQ	BP, (g_sched+gobuf_bp)(SI)
   354  
   355  	// Call newstack on m->g0's stack.
   356  	MOVQ	m_g0(BX), BX
   357  	MOVQ	BX, g(CX)
   358  	MOVQ	(g_sched+gobuf_sp)(BX), SP
   359  	CALL	runtime·newstack(SB)
   360  	MOVQ	$0, 0x1003	// crash if newstack returns
   361  	RET
   362  
   363  // morestack but not preserving ctxt.
   364  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   365  	MOVL	$0, DX
   366  	JMP	runtime·morestack(SB)
   367  
   368  TEXT runtime·stackBarrier(SB),NOSPLIT,$0
   369  	// We came here via a RET to an overwritten return PC.
   370  	// AX may be live. Other registers are available.
   371  
   372  	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   373  	get_tls(CX)
   374  	MOVQ	g(CX), CX
   375  	MOVQ	(g_stkbar+slice_array)(CX), DX
   376  	MOVQ	g_stkbarPos(CX), BX
   377  	IMULQ	$stkbar__size, BX	// Too big for SIB.
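        	// stkbar entries are larger than the 1/2/4/8 scales a SIB byte allows,
        	// so compute the byte offset with a multiply instead of scaled indexing.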
   378  	MOVQ	stkbar_savedLRPtr(DX)(BX*1), R8
   379  	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
   380  	// Assert that we're popping the right saved LR.
   381  	ADDQ	$8, R8
   382  	CMPQ	R8, SP
   383  	JEQ	2(PC)
   384  	MOVL	$0, 0
   385  	// Record that this stack barrier was hit.
   386  	ADDQ	$1, g_stkbarPos(CX)
   387  	// Jump to the original return PC.
   388  	JMP	BX
   389  
   390  // reflectcall: call a function with the given argument list
   391  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   392  // we don't have variable-sized frames, so we use a small number
   393  // of constant-sized-frame functions to encode a few bits of size in the pc.
   394  // Caution: ugly multiline assembly macros in your future!
   395  
   396  #define DISPATCH(NAME,MAXSIZE)		\
   397  	CMPQ	CX, $MAXSIZE;		\
   398  	JA	3(PC);			\
   399  	MOVQ	$NAME(SB), AX;		\
   400  	JMP	AX
   401  // Note: can't just "JMP NAME(SB)" - bad inlining results.
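        // For example, an argsize of 100 bytes fails the JA checks for call32 and
        // call64 and dispatches to runtime·call128, whose fixed 128-byte frame is
        // large enough to hold the copied arguments.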
   402  
   403  TEXT reflect·call(SB), NOSPLIT, $0-0
   404  	JMP	·reflectcall(SB)
   405  
   406  TEXT ·reflectcall(SB), NOSPLIT, $0-32
   407  	MOVLQZX argsize+24(FP), CX
   408  	// NOTE(rsc): No call16, because CALLFN needs four words
   409  	// of argument space to invoke callwritebarrier.
   410  	DISPATCH(runtime·call32, 32)
   411  	DISPATCH(runtime·call64, 64)
   412  	DISPATCH(runtime·call128, 128)
   413  	DISPATCH(runtime·call256, 256)
   414  	DISPATCH(runtime·call512, 512)
   415  	DISPATCH(runtime·call1024, 1024)
   416  	DISPATCH(runtime·call2048, 2048)
   417  	DISPATCH(runtime·call4096, 4096)
   418  	DISPATCH(runtime·call8192, 8192)
   419  	DISPATCH(runtime·call16384, 16384)
   420  	DISPATCH(runtime·call32768, 32768)
   421  	DISPATCH(runtime·call65536, 65536)
   422  	DISPATCH(runtime·call131072, 131072)
   423  	DISPATCH(runtime·call262144, 262144)
   424  	DISPATCH(runtime·call524288, 524288)
   425  	DISPATCH(runtime·call1048576, 1048576)
   426  	DISPATCH(runtime·call2097152, 2097152)
   427  	DISPATCH(runtime·call4194304, 4194304)
   428  	DISPATCH(runtime·call8388608, 8388608)
   429  	DISPATCH(runtime·call16777216, 16777216)
   430  	DISPATCH(runtime·call33554432, 33554432)
   431  	DISPATCH(runtime·call67108864, 67108864)
   432  	DISPATCH(runtime·call134217728, 134217728)
   433  	DISPATCH(runtime·call268435456, 268435456)
   434  	DISPATCH(runtime·call536870912, 536870912)
   435  	DISPATCH(runtime·call1073741824, 1073741824)
   436  	MOVQ	$runtime·badreflectcall(SB), AX
   437  	JMP	AX
   438  
   439  #define CALLFN(NAME,MAXSIZE)			\
   440  TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   441  	NO_LOCAL_POINTERS;			\
   442  	/* copy arguments to stack */		\
   443  	MOVQ	argptr+16(FP), SI;		\
   444  	MOVLQZX argsize+24(FP), CX;		\
   445  	MOVQ	SP, DI;				\
   446  	REP;MOVSB;				\
   447  	/* call function */			\
   448  	MOVQ	f+8(FP), DX;			\
   449  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   450  	CALL	(DX);				\
   451  	/* copy return values back */		\
   452  	MOVQ	argptr+16(FP), DI;		\
   453  	MOVLQZX	argsize+24(FP), CX;		\
   454  	MOVLQZX retoffset+28(FP), BX;		\
   455  	MOVQ	SP, SI;				\
   456  	ADDQ	BX, DI;				\
   457  	ADDQ	BX, SI;				\
   458  	SUBQ	BX, CX;				\
   459  	REP;MOVSB;				\
   460  	/* execute write barrier updates */	\
   461  	MOVQ	argtype+0(FP), DX;		\
   462  	MOVQ	argptr+16(FP), DI;		\
   463  	MOVLQZX	argsize+24(FP), CX;		\
   464  	MOVLQZX retoffset+28(FP), BX;		\
   465  	MOVQ	DX, 0(SP);			\
   466  	MOVQ	DI, 8(SP);			\
   467  	MOVQ	CX, 16(SP);			\
   468  	MOVQ	BX, 24(SP);			\
   469  	CALL	runtime·callwritebarrier(SB);	\
   470  	RET
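        // Each callNNN instance copies the arguments into its fixed-size frame,
        // calls the function, copies any results back into the caller's argument
        // area, and then calls callwritebarrier so the GC notices pointers written
        // into the result space.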
   471  
   472  CALLFN(·call32, 32)
   473  CALLFN(·call64, 64)
   474  CALLFN(·call128, 128)
   475  CALLFN(·call256, 256)
   476  CALLFN(·call512, 512)
   477  CALLFN(·call1024, 1024)
   478  CALLFN(·call2048, 2048)
   479  CALLFN(·call4096, 4096)
   480  CALLFN(·call8192, 8192)
   481  CALLFN(·call16384, 16384)
   482  CALLFN(·call32768, 32768)
   483  CALLFN(·call65536, 65536)
   484  CALLFN(·call131072, 131072)
   485  CALLFN(·call262144, 262144)
   486  CALLFN(·call524288, 524288)
   487  CALLFN(·call1048576, 1048576)
   488  CALLFN(·call2097152, 2097152)
   489  CALLFN(·call4194304, 4194304)
   490  CALLFN(·call8388608, 8388608)
   491  CALLFN(·call16777216, 16777216)
   492  CALLFN(·call33554432, 33554432)
   493  CALLFN(·call67108864, 67108864)
   494  CALLFN(·call134217728, 134217728)
   495  CALLFN(·call268435456, 268435456)
   496  CALLFN(·call536870912, 536870912)
   497  CALLFN(·call1073741824, 1073741824)
   498  
   499  TEXT runtime·procyield(SB),NOSPLIT,$0-0
   500  	MOVL	cycles+0(FP), AX
   501  again:
   502  	PAUSE
   503  	SUBL	$1, AX
   504  	JNZ	again
   505  	RET
   506  
   507  
   508  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   509  	// Stores are already ordered on x86, so this is just a
   510  	// compile barrier.
   511  	RET
   512  
   513  // void jmpdefer(fn, sp);
   514  // called from deferreturn.
   515  // 1. pop the caller
   516  // 2. sub 5 bytes (the size of the CALL instruction) from the caller's return PC
   517  // 3. jmp to the argument
   518  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   519  	MOVQ	fv+0(FP), DX	// fn
   520  	MOVQ	argp+8(FP), BX	// caller sp
   521  	LEAQ	-8(BX), SP	// caller sp after CALL
   522  	SUBQ	$5, (SP)	// return to CALL again
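        	// A direct CALL instruction is 5 bytes (opcode plus rel32), so when the
        	// deferred function returns it re-executes the caller's CALL to
        	// deferreturn, which then runs any remaining defers.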
   523  	MOVQ	0(DX), BX
   524  	JMP	BX	// but first run the deferred function
   525  
   526  // Save state of caller into g->sched. Smashes R8, R9.
   527  TEXT gosave<>(SB),NOSPLIT,$0
   528  	get_tls(R8)
   529  	MOVQ	g(R8), R8
   530  	MOVQ	0(SP), R9
   531  	MOVQ	R9, (g_sched+gobuf_pc)(R8)
   532  	LEAQ	8(SP), R9
   533  	MOVQ	R9, (g_sched+gobuf_sp)(R8)
   534  	MOVQ	$0, (g_sched+gobuf_ret)(R8)
   535  	MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
   536  	MOVQ	BP, (g_sched+gobuf_bp)(R8)
   537  	RET
   538  
   539  // func asmcgocall(fn, arg unsafe.Pointer) int32
   540  // Call fn(arg) on the scheduler stack,
   541  // aligned appropriately for the gcc ABI.
   542  // See cgocall.go for more details.
   543  TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   544  	MOVQ	fn+0(FP), AX
   545  	MOVQ	arg+8(FP), BX
   546  
   547  	MOVQ	SP, DX
   548  
   549  	// Figure out if we need to switch to m->g0 stack.
   550  	// We get called to create new OS threads too, and those
   551  	// come in on the m->g0 stack already.
   552  	get_tls(CX)
   553  	MOVQ	g(CX), R8
   554  	CMPQ	R8, $0
   555  	JEQ	nosave
   556  	MOVQ	g_m(R8), R8
   557  	MOVQ	m_g0(R8), SI
   558  	MOVQ	g(CX), DI
   559  	CMPQ	SI, DI
   560  	JEQ	nosave
   561  	MOVQ	m_gsignal(R8), SI
   562  	CMPQ	SI, DI
   563  	JEQ	nosave
   564  	
   565  	// Switch to system stack.
   566  	MOVQ	m_g0(R8), SI
   567  	CALL	gosave<>(SB)
   568  	MOVQ	SI, g(CX)
   569  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   570  
   571  	// Now on a scheduling stack (a pthread-created stack).
   572  	// Make sure we have enough room for 4 stack-backed fast-call
   573  	// registers as per windows amd64 calling convention.
   574  	SUBQ	$64, SP
   575  	ANDQ	$~15, SP	// alignment for gcc ABI
   576  	MOVQ	DI, 48(SP)	// save g
   577  	MOVQ	(g_stack+stack_hi)(DI), DI
   578  	SUBQ	DX, DI
   579  	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   580  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   581  	MOVQ	BX, CX		// CX = first argument in Win64
   582  	CALL	AX
   583  
   584  	// Restore registers, g, stack pointer.
   585  	get_tls(CX)
   586  	MOVQ	48(SP), DI
   587  	MOVQ	(g_stack+stack_hi)(DI), SI
   588  	SUBQ	40(SP), SI
   589  	MOVQ	DI, g(CX)
   590  	MOVQ	SI, SP
   591  
   592  	MOVL	AX, ret+16(FP)
   593  	RET
   594  
   595  nosave:
   596  	// Running on a system stack, perhaps even without a g.
   597  	// Having no g can happen during thread creation or thread teardown
   598  	// (see needm/dropm on Solaris, for example).
   599  	// This code is like the above sequence but without saving/restoring g
   600  	// and without worrying about the stack moving out from under us
   601  	// (because we're on a system stack, not a goroutine stack).
   602  	// The above code could be used directly if already on a system stack,
   603  	// but then the only path through this code would be a rare case on Solaris.
   604  	// Using this code for all "already on system stack" calls exercises it more,
   605  	// which should help keep it correct.
   606  	SUBQ	$64, SP
   607  	ANDQ	$~15, SP
   608  	MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
   609  	MOVQ	DX, 40(SP)	// save original stack pointer
   610  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   611  	MOVQ	BX, CX		// CX = first argument in Win64
   612  	CALL	AX
   613  	MOVQ	40(SP), SI	// restore original stack pointer
   614  	MOVQ	SI, SP
   615  	MOVL	AX, ret+16(FP)
   616  	RET
   617  
   618  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
   619  // Turn the fn into a Go func (by taking its address) and call
   620  // cgocallback_gofunc.
   621  TEXT runtime·cgocallback(SB),NOSPLIT,$24-24
   622  	LEAQ	fn+0(FP), AX
   623  	MOVQ	AX, 0(SP)
   624  	MOVQ	frame+8(FP), AX
   625  	MOVQ	AX, 8(SP)
   626  	MOVQ	framesize+16(FP), AX
   627  	MOVQ	AX, 16(SP)
   628  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   629  	CALL	AX
   630  	RET
   631  
   632  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
   633  // See cgocall.go for more details.
   634  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-24
   635  	NO_LOCAL_POINTERS
   636  
   637  	// If g is nil, Go did not create the current thread.
   638  	// Call needm to obtain one m for temporary use.
   639  	// In this case, we're running on the thread stack, so there's
   640  	// lots of space, but the linker doesn't know. Hide the call from
   641  	// the linker analysis by using an indirect call through AX.
   642  	get_tls(CX)
   643  #ifdef GOOS_windows
   644  	MOVL	$0, BX
   645  	CMPQ	CX, $0
   646  	JEQ	2(PC)
   647  #endif
   648  	MOVQ	g(CX), BX
   649  	CMPQ	BX, $0
   650  	JEQ	needm
   651  	MOVQ	g_m(BX), BX
   652  	MOVQ	BX, R8 // holds oldm until end of function
   653  	JMP	havem
   654  needm:
   655  	MOVQ	$0, 0(SP)
   656  	MOVQ	$runtime·needm(SB), AX
   657  	CALL	AX
   658  	MOVQ	0(SP), R8
   659  	get_tls(CX)
   660  	MOVQ	g(CX), BX
   661  	MOVQ	g_m(BX), BX
   662  	
   663  	// Set m->sched.sp = SP, so that if a panic happens
   664  	// during the function we are about to execute, it will
   665  	// have a valid SP to run on the g0 stack.
   666  	// The next few lines (after the havem label)
   667  	// will save this SP onto the stack and then write
   668  	// the same SP back to m->sched.sp. That seems redundant,
   669  	// but if an unrecovered panic happens, unwindm will
   670  	// restore the g->sched.sp from the stack location
   671  	// and then systemstack will try to use it. If we don't set it here,
   672  	// that restored SP will be uninitialized (typically 0) and
   673  	// will not be usable.
   674  	MOVQ	m_g0(BX), SI
   675  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   676  
   677  havem:
   678  	// Now there's a valid m, and we're running on its m->g0.
   679  	// Save current m->g0->sched.sp on stack and then set it to SP.
   680  	// Save current sp in m->g0->sched.sp in preparation for
   681  	// switch back to m->curg stack.
   682  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   683  	MOVQ	m_g0(BX), SI
   684  	MOVQ	(g_sched+gobuf_sp)(SI), AX
   685  	MOVQ	AX, 0(SP)
   686  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   687  
   688  	// Switch to m->curg stack and call runtime.cgocallbackg.
   689  	// Because we are taking over the execution of m->curg
   690  	// but *not* resuming what had been running, we need to
   691  	// save that information (m->curg->sched) so we can restore it.
   692  	// We can restore m->curg->sched.sp easily, because calling
   693  	// runtime.cgocallbackg leaves SP unchanged upon return.
   694  	// To save m->curg->sched.pc, we push it onto the stack.
   695  	// This has the added benefit that it looks to the traceback
   696  	// routine like cgocallbackg is going to return to that
   697  	// PC (because the frame we allocate below has the same
   698  	// size as cgocallback_gofunc's frame declared above)
   699  	// so that the traceback will seamlessly trace back into
   700  	// the earlier calls.
   701  	//
   702  	// In the new goroutine, 0(SP) holds the saved R8.
   703  	MOVQ	m_curg(BX), SI
   704  	MOVQ	SI, g(CX)
   705  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   706  	MOVQ	(g_sched+gobuf_pc)(SI), BX
   707  	MOVQ	BX, -8(DI)
   708  	// Compute the size of the frame, including return PC and, if
   709  	// GOEXPERIMENT=framepointer, the saved base pointer
   710  	LEAQ	fv+0(FP), AX
   711  	SUBQ	SP, AX
   712  	SUBQ	AX, DI
   713  	MOVQ	DI, SP
   714  
   715  	MOVQ	R8, 0(SP)
   716  	CALL	runtime·cgocallbackg(SB)
   717  	MOVQ	0(SP), R8
   718  
   719  	// Compute the size of the frame again.  FP and SP have
   720  	// completely different values here than they did above,
   721  	// but only their difference matters.
   722  	LEAQ	fv+0(FP), AX
   723  	SUBQ	SP, AX
   724  
   725  	// Restore g->sched (== m->curg->sched) from saved values.
   726  	get_tls(CX)
   727  	MOVQ	g(CX), SI
   728  	MOVQ	SP, DI
   729  	ADDQ	AX, DI
   730  	MOVQ	-8(DI), BX
   731  	MOVQ	BX, (g_sched+gobuf_pc)(SI)
   732  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   733  
   734  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   735  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   736  	// so we do not have to restore it.)
   737  	MOVQ	g(CX), BX
   738  	MOVQ	g_m(BX), BX
   739  	MOVQ	m_g0(BX), SI
   740  	MOVQ	SI, g(CX)
   741  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   742  	MOVQ	0(SP), AX
   743  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   744  	
   745  	// If the m on entry was nil, we called needm above to borrow an m
   746  	// for the duration of the call. Since the call is over, return it with dropm.
   747  	CMPQ	R8, $0
   748  	JNE 3(PC)
   749  	MOVQ	$runtime·dropm(SB), AX
   750  	CALL	AX
   751  
   752  	// Done!
   753  	RET
   754  
   755  // void setg(G*); set g. for use by needm.
   756  TEXT runtime·setg(SB), NOSPLIT, $0-8
   757  	MOVQ	gg+0(FP), BX
   758  #ifdef GOOS_windows
   759  	CMPQ	BX, $0
   760  	JNE	settls
   761  	MOVQ	$0, 0x28(GS)
   762  	RET
   763  settls:
   764  	MOVQ	g_m(BX), AX
   765  	LEAQ	m_tls(AX), AX
   766  	MOVQ	AX, 0x28(GS)
   767  #endif
   768  	get_tls(CX)
   769  	MOVQ	BX, g(CX)
   770  	RET
   771  
   772  // void setg_gcc(G*); set g called from gcc.
   773  TEXT setg_gcc<>(SB),NOSPLIT,$0
   774  	get_tls(AX)
   775  	MOVQ	DI, g(AX)
   776  	RET
   777  
   778  // check that SP is in range [g->stack.lo, g->stack.hi)
   779  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   780  	get_tls(CX)
   781  	MOVQ	g(CX), AX
   782  	CMPQ	(g_stack+stack_hi)(AX), SP
   783  	JHI	2(PC)
   784  	INT	$3
   785  	CMPQ	SP, (g_stack+stack_lo)(AX)
   786  	JHI	2(PC)
   787  	INT	$3
   788  	RET
   789  
   790  TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
   791  	MOVQ	argp+0(FP),AX		// addr of first arg
   792  	MOVQ	-8(AX),AX		// get calling pc
   793  	CMPQ	AX, runtime·stackBarrierPC(SB)
   794  	JNE	nobar
   795  	// Get original return PC.
   796  	CALL	runtime·nextBarrierPC(SB)
   797  	MOVQ	0(SP), AX
   798  nobar:
   799  	MOVQ	AX, ret+8(FP)
   800  	RET
   801  
   802  TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
   803  	MOVQ	argp+0(FP),AX		// addr of first arg
   804  	MOVQ	pc+8(FP), BX
   805  	MOVQ	-8(AX), CX
   806  	CMPQ	CX, runtime·stackBarrierPC(SB)
   807  	JEQ	setbar
   808  	MOVQ	BX, -8(AX)		// set calling pc
   809  	RET
   810  setbar:
   811  	// Set the stack barrier return PC.
   812  	MOVQ	BX, 0(SP)
   813  	CALL	runtime·setNextBarrierPC(SB)
   814  	RET
   815  
   816  TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
   817  	MOVQ	argp+0(FP), AX
   818  	MOVQ	AX, ret+8(FP)
   819  	RET
   820  
   821  // func cputicks() int64
   822  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   823  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   824  	JNE	mfence
   825  	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
   826  	JMP	done
   827  mfence:
   828  	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
   829  done:
   830  	RDTSC
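        	// RDTSC returns the counter split across EDX (high 32 bits) and
        	// EAX (low 32 bits); combine them into a single 64-bit value.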
   831  	SHLQ	$32, DX
   832  	ADDQ	DX, AX
   833  	MOVQ	AX, ret+0(FP)
   834  	RET
   835  
   836  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   837  // redirects to memhash(p, h, size) using the size
   838  // stored in the closure.
   839  TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
   840  	GO_ARGS
   841  	NO_LOCAL_POINTERS
   842  	MOVQ	p+0(FP), AX
   843  	MOVQ	h+8(FP), BX
   844  	MOVQ	8(DX), CX
   845  	MOVQ	AX, 0(SP)
   846  	MOVQ	BX, 8(SP)
   847  	MOVQ	CX, 16(SP)
   848  	CALL	runtime·memhash(SB)
   849  	MOVQ	24(SP), AX
   850  	MOVQ	AX, ret+16(FP)
   851  	RET
   852  
   853  // hash function using AES hardware instructions
   854  TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   855  	MOVQ	p+0(FP), AX	// ptr to data
   856  	MOVQ	s+16(FP), CX	// size
   857  	LEAQ	ret+24(FP), DX
   858  	JMP	runtime·aeshashbody(SB)
   859  
   860  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
   861  	MOVQ	p+0(FP), AX	// ptr to string struct
   862  	MOVQ	8(AX), CX	// length of string
   863  	MOVQ	(AX), AX	// string data
   864  	LEAQ	ret+16(FP), DX
   865  	JMP	runtime·aeshashbody(SB)
   866  
   867  // AX: data
   868  // CX: length
   869  // DX: address to put return value
   870  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   871  	// Fill an SSE register with our seeds.
   872  	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
   873  	PINSRW	$4, CX, X0			// 16 bits of length
   874  	PSHUFHW $0, X0, X0			// repeat length 4 times total
   875  	MOVO	X0, X1				// save unscrambled seed
   876  	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
   877  	AESENC	X0, X0				// scramble seed
   878  
   879  	CMPQ	CX, $16
   880  	JB	aes0to15
   881  	JE	aes16
   882  	CMPQ	CX, $32
   883  	JBE	aes17to32
   884  	CMPQ	CX, $64
   885  	JBE	aes33to64
   886  	CMPQ	CX, $128
   887  	JBE	aes65to128
   888  	JMP	aes129plus
   889  
   890  aes0to15:
   891  	TESTQ	CX, CX
   892  	JE	aes0
   893  
   894  	ADDQ	$16, AX
   895  	TESTW	$0xff0, AX
   896  	JE	endofpage
   897  
   898  	// 16 bytes loaded at this address won't cross
   899  	// a page boundary, so we can load it directly.
   900  	MOVOU	-16(AX), X1
   901  	ADDQ	CX, CX
   902  	MOVQ	$masks<>(SB), AX
   903  	PAND	(AX)(CX*8), X1
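        	// CX was doubled above, so (AX)(CX*8) selects the 16-byte mask entry
        	// for the original length; the PAND clears the bytes beyond it.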
   904  final1:
   905  	AESENC	X0, X1	// scramble input, xor in seed
   906  	AESENC	X1, X1  // scramble combo 2 times
   907  	AESENC	X1, X1
   908  	MOVQ	X1, (DX)
   909  	RET
   910  
   911  endofpage:
   912  	// address ends in 1111xxxx.  Might be up against
   913  	// a page boundary, so load ending at last byte.
   914  	// Then shift bytes down using pshufb.
   915  	MOVOU	-32(AX)(CX*1), X1
   916  	ADDQ	CX, CX
   917  	MOVQ	$shifts<>(SB), AX
   918  	PSHUFB	(AX)(CX*8), X1
   919  	JMP	final1
   920  
   921  aes0:
   922  	// Return scrambled input seed
   923  	AESENC	X0, X0
   924  	MOVQ	X0, (DX)
   925  	RET
   926  
   927  aes16:
   928  	MOVOU	(AX), X1
   929  	JMP	final1
   930  
   931  aes17to32:
   932  	// make second starting seed
   933  	PXOR	runtime·aeskeysched+16(SB), X1
   934  	AESENC	X1, X1
   935  	
   936  	// load data to be hashed
   937  	MOVOU	(AX), X2
   938  	MOVOU	-16(AX)(CX*1), X3
   939  
   940  	// scramble 3 times
   941  	AESENC	X0, X2
   942  	AESENC	X1, X3
   943  	AESENC	X2, X2
   944  	AESENC	X3, X3
   945  	AESENC	X2, X2
   946  	AESENC	X3, X3
   947  
   948  	// combine results
   949  	PXOR	X3, X2
   950  	MOVQ	X2, (DX)
   951  	RET
   952  
   953  aes33to64:
   954  	// make 3 more starting seeds
   955  	MOVO	X1, X2
   956  	MOVO	X1, X3
   957  	PXOR	runtime·aeskeysched+16(SB), X1
   958  	PXOR	runtime·aeskeysched+32(SB), X2
   959  	PXOR	runtime·aeskeysched+48(SB), X3
   960  	AESENC	X1, X1
   961  	AESENC	X2, X2
   962  	AESENC	X3, X3
   963  	
   964  	MOVOU	(AX), X4
   965  	MOVOU	16(AX), X5
   966  	MOVOU	-32(AX)(CX*1), X6
   967  	MOVOU	-16(AX)(CX*1), X7
   968  	
   969  	AESENC	X0, X4
   970  	AESENC	X1, X5
   971  	AESENC	X2, X6
   972  	AESENC	X3, X7
   973  	
   974  	AESENC	X4, X4
   975  	AESENC	X5, X5
   976  	AESENC	X6, X6
   977  	AESENC	X7, X7
   978  	
   979  	AESENC	X4, X4
   980  	AESENC	X5, X5
   981  	AESENC	X6, X6
   982  	AESENC	X7, X7
   983  
   984  	PXOR	X6, X4
   985  	PXOR	X7, X5
   986  	PXOR	X5, X4
   987  	MOVQ	X4, (DX)
   988  	RET
   989  
   990  aes65to128:
   991  	// make 7 more starting seeds
   992  	MOVO	X1, X2
   993  	MOVO	X1, X3
   994  	MOVO	X1, X4
   995  	MOVO	X1, X5
   996  	MOVO	X1, X6
   997  	MOVO	X1, X7
   998  	PXOR	runtime·aeskeysched+16(SB), X1
   999  	PXOR	runtime·aeskeysched+32(SB), X2
  1000  	PXOR	runtime·aeskeysched+48(SB), X3
  1001  	PXOR	runtime·aeskeysched+64(SB), X4
  1002  	PXOR	runtime·aeskeysched+80(SB), X5
  1003  	PXOR	runtime·aeskeysched+96(SB), X6
  1004  	PXOR	runtime·aeskeysched+112(SB), X7
  1005  	AESENC	X1, X1
  1006  	AESENC	X2, X2
  1007  	AESENC	X3, X3
  1008  	AESENC	X4, X4
  1009  	AESENC	X5, X5
  1010  	AESENC	X6, X6
  1011  	AESENC	X7, X7
  1012  
  1013  	// load data
  1014  	MOVOU	(AX), X8
  1015  	MOVOU	16(AX), X9
  1016  	MOVOU	32(AX), X10
  1017  	MOVOU	48(AX), X11
  1018  	MOVOU	-64(AX)(CX*1), X12
  1019  	MOVOU	-48(AX)(CX*1), X13
  1020  	MOVOU	-32(AX)(CX*1), X14
  1021  	MOVOU	-16(AX)(CX*1), X15
  1022  
  1023  	// scramble data, xor in seed
  1024  	AESENC	X0, X8
  1025  	AESENC	X1, X9
  1026  	AESENC	X2, X10
  1027  	AESENC	X3, X11
  1028  	AESENC	X4, X12
  1029  	AESENC	X5, X13
  1030  	AESENC	X6, X14
  1031  	AESENC	X7, X15
  1032  
  1033  	// scramble twice
  1034  	AESENC	X8, X8
  1035  	AESENC	X9, X9
  1036  	AESENC	X10, X10
  1037  	AESENC	X11, X11
  1038  	AESENC	X12, X12
  1039  	AESENC	X13, X13
  1040  	AESENC	X14, X14
  1041  	AESENC	X15, X15
  1042  	
  1043  	AESENC	X8, X8
  1044  	AESENC	X9, X9
  1045  	AESENC	X10, X10
  1046  	AESENC	X11, X11
  1047  	AESENC	X12, X12
  1048  	AESENC	X13, X13
  1049  	AESENC	X14, X14
  1050  	AESENC	X15, X15
  1051  
  1052  	// combine results
  1053  	PXOR	X12, X8
  1054  	PXOR	X13, X9
  1055  	PXOR	X14, X10
  1056  	PXOR	X15, X11
  1057  	PXOR	X10, X8
  1058  	PXOR	X11, X9
  1059  	PXOR	X9, X8
  1060  	MOVQ	X8, (DX)
  1061  	RET
  1062  
  1063  aes129plus:
  1064  	// make 7 more starting seeds
  1065  	MOVO	X1, X2
  1066  	MOVO	X1, X3
  1067  	MOVO	X1, X4
  1068  	MOVO	X1, X5
  1069  	MOVO	X1, X6
  1070  	MOVO	X1, X7
  1071  	PXOR	runtime·aeskeysched+16(SB), X1
  1072  	PXOR	runtime·aeskeysched+32(SB), X2
  1073  	PXOR	runtime·aeskeysched+48(SB), X3
  1074  	PXOR	runtime·aeskeysched+64(SB), X4
  1075  	PXOR	runtime·aeskeysched+80(SB), X5
  1076  	PXOR	runtime·aeskeysched+96(SB), X6
  1077  	PXOR	runtime·aeskeysched+112(SB), X7
  1078  	AESENC	X1, X1
  1079  	AESENC	X2, X2
  1080  	AESENC	X3, X3
  1081  	AESENC	X4, X4
  1082  	AESENC	X5, X5
  1083  	AESENC	X6, X6
  1084  	AESENC	X7, X7
  1085  	
  1086  	// start with last (possibly overlapping) block
  1087  	MOVOU	-128(AX)(CX*1), X8
  1088  	MOVOU	-112(AX)(CX*1), X9
  1089  	MOVOU	-96(AX)(CX*1), X10
  1090  	MOVOU	-80(AX)(CX*1), X11
  1091  	MOVOU	-64(AX)(CX*1), X12
  1092  	MOVOU	-48(AX)(CX*1), X13
  1093  	MOVOU	-32(AX)(CX*1), X14
  1094  	MOVOU	-16(AX)(CX*1), X15
  1095  
  1096  	// scramble input once, xor in seed
  1097  	AESENC	X0, X8
  1098  	AESENC	X1, X9
  1099  	AESENC	X2, X10
  1100  	AESENC	X3, X11
  1101  	AESENC	X4, X12
  1102  	AESENC	X5, X13
  1103  	AESENC	X6, X14
  1104  	AESENC	X7, X15
  1105  	
  1106  	// compute number of remaining 128-byte blocks
  1107  	DECQ	CX
  1108  	SHRQ	$7, CX
  1109  	
  1110  aesloop:
  1111  	// scramble state, xor in a block
  1112  	MOVOU	(AX), X0
  1113  	MOVOU	16(AX), X1
  1114  	MOVOU	32(AX), X2
  1115  	MOVOU	48(AX), X3
  1116  	AESENC	X0, X8
  1117  	AESENC	X1, X9
  1118  	AESENC	X2, X10
  1119  	AESENC	X3, X11
  1120  	MOVOU	64(AX), X4
  1121  	MOVOU	80(AX), X5
  1122  	MOVOU	96(AX), X6
  1123  	MOVOU	112(AX), X7
  1124  	AESENC	X4, X12
  1125  	AESENC	X5, X13
  1126  	AESENC	X6, X14
  1127  	AESENC	X7, X15
  1128  
  1129  	// scramble state
  1130  	AESENC	X8, X8
  1131  	AESENC	X9, X9
  1132  	AESENC	X10, X10
  1133  	AESENC	X11, X11
  1134  	AESENC	X12, X12
  1135  	AESENC	X13, X13
  1136  	AESENC	X14, X14
  1137  	AESENC	X15, X15
  1138  
  1139  	ADDQ	$128, AX
  1140  	DECQ	CX
  1141  	JNE	aesloop
  1142  
  1143  	// 2 more scrambles to finish
  1144  	AESENC	X8, X8
  1145  	AESENC	X9, X9
  1146  	AESENC	X10, X10
  1147  	AESENC	X11, X11
  1148  	AESENC	X12, X12
  1149  	AESENC	X13, X13
  1150  	AESENC	X14, X14
  1151  	AESENC	X15, X15
  1152  	AESENC	X8, X8
  1153  	AESENC	X9, X9
  1154  	AESENC	X10, X10
  1155  	AESENC	X11, X11
  1156  	AESENC	X12, X12
  1157  	AESENC	X13, X13
  1158  	AESENC	X14, X14
  1159  	AESENC	X15, X15
  1160  
  1161  	PXOR	X12, X8
  1162  	PXOR	X13, X9
  1163  	PXOR	X14, X10
  1164  	PXOR	X15, X11
  1165  	PXOR	X10, X8
  1166  	PXOR	X11, X9
  1167  	PXOR	X9, X8
  1168  	MOVQ	X8, (DX)
  1169  	RET
  1170  	
  1171  TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
  1172  	MOVQ	p+0(FP), AX	// ptr to data
  1173  	MOVQ	h+8(FP), X0	// seed
  1174  	PINSRD	$2, (AX), X0	// data
  1175  	AESENC	runtime·aeskeysched+0(SB), X0
  1176  	AESENC	runtime·aeskeysched+16(SB), X0
  1177  	AESENC	runtime·aeskeysched+32(SB), X0
  1178  	MOVQ	X0, ret+16(FP)
  1179  	RET
  1180  
  1181  TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
  1182  	MOVQ	p+0(FP), AX	// ptr to data
  1183  	MOVQ	h+8(FP), X0	// seed
  1184  	PINSRQ	$1, (AX), X0	// data
  1185  	AESENC	runtime·aeskeysched+0(SB), X0
  1186  	AESENC	runtime·aeskeysched+16(SB), X0
  1187  	AESENC	runtime·aeskeysched+32(SB), X0
  1188  	MOVQ	X0, ret+16(FP)
  1189  	RET
  1190  
  1191  // simple mask to get rid of data in the high part of the register.
  1192  DATA masks<>+0x00(SB)/8, $0x0000000000000000
  1193  DATA masks<>+0x08(SB)/8, $0x0000000000000000
  1194  DATA masks<>+0x10(SB)/8, $0x00000000000000ff
  1195  DATA masks<>+0x18(SB)/8, $0x0000000000000000
  1196  DATA masks<>+0x20(SB)/8, $0x000000000000ffff
  1197  DATA masks<>+0x28(SB)/8, $0x0000000000000000
  1198  DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
  1199  DATA masks<>+0x38(SB)/8, $0x0000000000000000
  1200  DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
  1201  DATA masks<>+0x48(SB)/8, $0x0000000000000000
  1202  DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
  1203  DATA masks<>+0x58(SB)/8, $0x0000000000000000
  1204  DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
  1205  DATA masks<>+0x68(SB)/8, $0x0000000000000000
  1206  DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
  1207  DATA masks<>+0x78(SB)/8, $0x0000000000000000
  1208  DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
  1209  DATA masks<>+0x88(SB)/8, $0x0000000000000000
  1210  DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
  1211  DATA masks<>+0x98(SB)/8, $0x00000000000000ff
  1212  DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
  1213  DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
  1214  DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
  1215  DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
  1216  DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
  1217  DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
  1218  DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
  1219  DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
  1220  DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
  1221  DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
  1222  DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
  1223  DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
  1224  GLOBL masks<>(SB),RODATA,$256
  1225  
  1226  TEXT ·checkASM(SB),NOSPLIT,$0-1
  1227  	// check that masks<>(SB) and shifts<>(SB) are 16-byte aligned
  1228  	MOVQ	$masks<>(SB), AX
  1229  	MOVQ	$shifts<>(SB), BX
  1230  	ORQ	BX, AX
  1231  	TESTQ	$15, AX
  1232  	SETEQ	ret+0(FP)
  1233  	RET
  1234  
  1235  // these are arguments to pshufb.  They move data down from
  1236  // the high bytes of the register to the low bytes of the register.
  1237  // index is how many bytes to move.
  1238  DATA shifts<>+0x00(SB)/8, $0x0000000000000000
  1239  DATA shifts<>+0x08(SB)/8, $0x0000000000000000
  1240  DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
  1241  DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
  1242  DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
  1243  DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
  1244  DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
  1245  DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
  1246  DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
  1247  DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
  1248  DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
  1249  DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
  1250  DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
  1251  DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
  1252  DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
  1253  DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
  1254  DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
  1255  DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
  1256  DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
  1257  DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
  1258  DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
  1259  DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
  1260  DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
  1261  DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
  1262  DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
  1263  DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
  1264  DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
  1265  DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
  1266  DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
  1267  DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
  1268  DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
  1269  DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
  1270  GLOBL shifts<>(SB),RODATA,$256
  1271  
  1272  TEXT runtime·memeq(SB),NOSPLIT,$0-25
  1273  	MOVQ	a+0(FP), SI
  1274  	MOVQ	b+8(FP), DI
  1275  	MOVQ	size+16(FP), BX
  1276  	LEAQ	ret+24(FP), AX
  1277  	JMP	runtime·memeqbody(SB)
  1278  
  1279  // memequal_varlen(a, b unsafe.Pointer) bool
  1280  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
  1281  	MOVQ	a+0(FP), SI
  1282  	MOVQ	b+8(FP), DI
  1283  	CMPQ	SI, DI
  1284  	JEQ	eq
  1285  	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
  1286  	LEAQ	ret+16(FP), AX
  1287  	JMP	runtime·memeqbody(SB)
  1288  eq:
  1289  	MOVB	$1, ret+16(FP)
  1290  	RET
  1291  
  1292  // eqstring tests whether two strings are equal.
  1293  // The compiler guarantees that strings passed
  1294  // to eqstring have equal length.
  1295  // See runtime_test.go:eqstring_generic for
  1296  // equivalent Go code.
  1297  TEXT runtime·eqstring(SB),NOSPLIT,$0-33
  1298  	MOVQ	s1str+0(FP), SI
  1299  	MOVQ	s2str+16(FP), DI
  1300  	CMPQ	SI, DI
  1301  	JEQ	eq
  1302  	MOVQ	s1len+8(FP), BX
  1303  	LEAQ	v+32(FP), AX
  1304  	JMP	runtime·memeqbody(SB)
  1305  eq:
  1306  	MOVB	$1, v+32(FP)
  1307  	RET
  1308  
  1309  // a in SI
  1310  // b in DI
  1311  // count in BX
  1312  // address of result byte in AX
  1313  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1314  	CMPQ	BX, $8
  1315  	JB	small
  1316  	CMPQ	BX, $64
  1317  	JB	bigloop
  1318  	CMPB    runtime·support_avx2(SB), $1
  1319  	JE	hugeloop_avx2
  1320  	
  1321  	// 64 bytes at a time using xmm registers
  1322  hugeloop:
  1323  	CMPQ	BX, $64
  1324  	JB	bigloop
  1325  	MOVOU	(SI), X0
  1326  	MOVOU	(DI), X1
  1327  	MOVOU	16(SI), X2
  1328  	MOVOU	16(DI), X3
  1329  	MOVOU	32(SI), X4
  1330  	MOVOU	32(DI), X5
  1331  	MOVOU	48(SI), X6
  1332  	MOVOU	48(DI), X7
  1333  	PCMPEQB	X1, X0
  1334  	PCMPEQB	X3, X2
  1335  	PCMPEQB	X5, X4
  1336  	PCMPEQB	X7, X6
  1337  	PAND	X2, X0
  1338  	PAND	X6, X4
  1339  	PAND	X4, X0
  1340  	PMOVMSKB X0, DX
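        	// X0 now holds the byte-wise AND of all four 16-byte comparisons, so
        	// its mask is 0xffff only if all 64 bytes matched.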
  1341  	ADDQ	$64, SI
  1342  	ADDQ	$64, DI
  1343  	SUBQ	$64, BX
  1344  	CMPL	DX, $0xffff
  1345  	JEQ	hugeloop
  1346  	MOVB	$0, (AX)
  1347  	RET
  1348  
  1349  	// 64 bytes at a time using ymm registers
  1350  hugeloop_avx2:
  1351  	CMPQ	BX, $64
  1352  	JB	bigloop_avx2
  1353  	MOVHDU	(SI), X0
  1354  	MOVHDU	(DI), X1
  1355  	MOVHDU	32(SI), X2
  1356  	MOVHDU	32(DI), X3
  1357  	VPCMPEQB	X1, X0, X4
  1358  	VPCMPEQB	X2, X3, X5
  1359  	VPAND	X4, X5, X6
  1360  	VPMOVMSKB X6, DX
  1361  	ADDQ	$64, SI
  1362  	ADDQ	$64, DI
  1363  	SUBQ	$64, BX
  1364  	CMPL	DX, $0xffffffff
  1365  	JEQ	hugeloop_avx2
  1366  	VZEROUPPER
  1367  	MOVB	$0, (AX)
  1368  	RET
  1369  
  1370  bigloop_avx2:
  1371  	VZEROUPPER
  1372  
  1373  	// 8 bytes at a time using 64-bit register
  1374  bigloop:
  1375  	CMPQ	BX, $8
  1376  	JBE	leftover
  1377  	MOVQ	(SI), CX
  1378  	MOVQ	(DI), DX
  1379  	ADDQ	$8, SI
  1380  	ADDQ	$8, DI
  1381  	SUBQ	$8, BX
  1382  	CMPQ	CX, DX
  1383  	JEQ	bigloop
  1384  	MOVB	$0, (AX)
  1385  	RET
  1386  
  1387  	// remaining 0-8 bytes
  1388  leftover:
  1389  	MOVQ	-8(SI)(BX*1), CX
  1390  	MOVQ	-8(DI)(BX*1), DX
  1391  	CMPQ	CX, DX
  1392  	SETEQ	(AX)
  1393  	RET
  1394  
  1395  small:
  1396  	CMPQ	BX, $0
  1397  	JEQ	equal
  1398  
  1399  	LEAQ	0(BX*8), CX
  1400  	NEGQ	CX
  1401  
  1402  	CMPB	SI, $0xf8
  1403  	JA	si_high
  1404  
  1405  	// load at SI won't cross a page boundary.
  1406  	MOVQ	(SI), SI
  1407  	JMP	si_finish
  1408  si_high:
  1409  	// address ends in 11111xxx.  The load might cross a page boundary, so load ending at the last byte we want and shift the data into position.
  1410  	MOVQ	-8(SI)(BX*1), SI
  1411  	SHRQ	CX, SI
  1412  si_finish:
  1413  
  1414  	// same for DI.
  1415  	CMPB	DI, $0xf8
  1416  	JA	di_high
  1417  	MOVQ	(DI), DI
  1418  	JMP	di_finish
  1419  di_high:
  1420  	MOVQ	-8(DI)(BX*1), DI
  1421  	SHRQ	CX, DI
  1422  di_finish:
  1423  
  1424  	SUBQ	SI, DI
  1425  	SHLQ	CX, DI
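        	// The subtraction leaves zeros in the low BX bytes iff they were equal;
        	// shifting left by 64-8*BX discards the rest, and the SETEQ below
        	// records whether anything remained.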
  1426  equal:
  1427  	SETEQ	(AX)
  1428  	RET
  1429  
  1430  TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
  1431  	MOVQ	s1_base+0(FP), SI
  1432  	MOVQ	s1_len+8(FP), BX
  1433  	MOVQ	s2_base+16(FP), DI
  1434  	MOVQ	s2_len+24(FP), DX
  1435  	LEAQ	ret+32(FP), R9
  1436  	JMP	runtime·cmpbody(SB)
  1437  
  1438  TEXT bytes·Compare(SB),NOSPLIT,$0-56
  1439  	MOVQ	s1+0(FP), SI
  1440  	MOVQ	s1+8(FP), BX
  1441  	MOVQ	s2+24(FP), DI
  1442  	MOVQ	s2+32(FP), DX
  1443  	LEAQ	res+48(FP), R9
  1444  	JMP	runtime·cmpbody(SB)
  1445  
  1446  // input:
  1447  //   SI = a
  1448  //   DI = b
  1449  //   BX = alen
  1450  //   DX = blen
  1451  //   R9 = address of output word (stores -1/0/1 here)
  1452  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1453  	CMPQ	SI, DI
  1454  	JEQ	allsame
  1455  	CMPQ	BX, DX
  1456  	MOVQ	DX, R8
  1457  	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
  1458  	CMPQ	R8, $8
  1459  	JB	small
  1460  
  1461  	CMPQ	R8, $63
  1462  	JBE	loop
  1463  	CMPB    runtime·support_avx2(SB), $1
  1464  	JEQ     big_loop_avx2
  1465  	JMP	big_loop
  1466  loop:
  1467  	CMPQ	R8, $16
  1468  	JBE	_0through16
  1469  	MOVOU	(SI), X0
  1470  	MOVOU	(DI), X1
  1471  	PCMPEQB X0, X1
  1472  	PMOVMSKB X1, AX
  1473  	XORQ	$0xffff, AX	// convert EQ to NE
  1474  	JNE	diff16	// branch if at least one byte is not equal
  1475  	ADDQ	$16, SI
  1476  	ADDQ	$16, DI
  1477  	SUBQ	$16, R8
  1478  	JMP	loop
  1479  	
  1480  diff64:
  1481  	ADDQ	$48, SI
  1482  	ADDQ	$48, DI
  1483  	JMP	diff16
  1484  diff48:
  1485  	ADDQ	$32, SI
  1486  	ADDQ	$32, DI
  1487  	JMP	diff16
  1488  diff32:
  1489  	ADDQ	$16, SI
  1490  	ADDQ	$16, DI
  1491  	// AX = bit mask of differences
  1492  diff16:
  1493  	BSFQ	AX, BX	// index of first byte that differs
  1494  	XORQ	AX, AX
  1495  	MOVB	(SI)(BX*1), CX
  1496  	CMPB	CX, (DI)(BX*1)
  1497  	SETHI	AX
  1498  	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
  1499  	MOVQ	AX, (R9)
  1500  	RET
  1501  
  1502  	// 0 through 16 bytes left, alen>=8, blen>=8
  1503  _0through16:
  1504  	CMPQ	R8, $8
  1505  	JBE	_0through8
  1506  	MOVQ	(SI), AX
  1507  	MOVQ	(DI), CX
  1508  	CMPQ	AX, CX
  1509  	JNE	diff8
  1510  _0through8:
  1511  	MOVQ	-8(SI)(R8*1), AX
  1512  	MOVQ	-8(DI)(R8*1), CX
  1513  	CMPQ	AX, CX
  1514  	JEQ	allsame
  1515  
  1516  	// AX and CX contain parts of a and b that differ.
  1517  diff8:
  1518  	BSWAPQ	AX	// reverse order of bytes
  1519  	BSWAPQ	CX
  1520  	XORQ	AX, CX
  1521  	BSRQ	CX, CX	// index of highest bit difference
  1522  	SHRQ	CX, AX	// move a's bit to bottom
  1523  	ANDQ	$1, AX	// mask bit
  1524  	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
  1525  	MOVQ	AX, (R9)
  1526  	RET
  1527  
  1528  	// 0-7 bytes in common
  1529  small:
  1530  	LEAQ	(R8*8), CX	// bytes left -> bits left
  1531  	NEGQ	CX		// - bits left (== 64 - bits left mod 64)
  1532  	JEQ	allsame
  1533  
  1534  	// load bytes of a into high bytes of SI
  1535  	CMPB	SI, $0xf8
  1536  	JA	si_high
  1537  	MOVQ	(SI), SI
  1538  	JMP	si_finish
  1539  si_high:
  1540  	MOVQ	-8(SI)(R8*1), SI
  1541  	SHRQ	CX, SI
  1542  si_finish:
  1543  	SHLQ	CX, SI
  1544  
  1545  	// load bytes of b into high bytes of DI
  1546  	CMPB	DI, $0xf8
  1547  	JA	di_high
  1548  	MOVQ	(DI), DI
  1549  	JMP	di_finish
  1550  di_high:
  1551  	MOVQ	-8(DI)(R8*1), DI
  1552  	SHRQ	CX, DI
  1553  di_finish:
  1554  	SHLQ	CX, DI
  1555  
  1556  	BSWAPQ	SI	// reverse order of bytes
  1557  	BSWAPQ	DI
  1558  	XORQ	SI, DI	// find bit differences
  1559  	JEQ	allsame
  1560  	BSRQ	DI, CX	// index of highest bit difference
  1561  	SHRQ	CX, SI	// move a's bit to bottom
  1562  	ANDQ	$1, SI	// mask bit
  1563  	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
  1564  	MOVQ	AX, (R9)
  1565  	RET
  1566  
  1567  allsame:
  1568  	XORQ	AX, AX
  1569  	XORQ	CX, CX
  1570  	CMPQ	BX, DX
  1571  	SETGT	AX	// 1 if alen > blen
  1572  	SETEQ	CX	// 1 if alen == blen
  1573  	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
  1574  	MOVQ	AX, (R9)
  1575  	RET
  1576  
  1577  	// this works for >= 64 bytes of data.
  1578  big_loop:
  1579  	MOVOU	(SI), X0
  1580  	MOVOU	(DI), X1
  1581  	PCMPEQB X0, X1
  1582  	PMOVMSKB X1, AX
  1583  	XORQ	$0xffff, AX
  1584  	JNE	diff16
  1585  
  1586  	MOVOU	16(SI), X0
  1587  	MOVOU	16(DI), X1
  1588  	PCMPEQB X0, X1
  1589  	PMOVMSKB X1, AX
  1590  	XORQ	$0xffff, AX
  1591  	JNE	diff32
  1592  
  1593  	MOVOU	32(SI), X0
  1594  	MOVOU	32(DI), X1
  1595  	PCMPEQB X0, X1
  1596  	PMOVMSKB X1, AX
  1597  	XORQ	$0xffff, AX
  1598  	JNE	diff48
  1599  
  1600  	MOVOU	48(SI), X0
  1601  	MOVOU	48(DI), X1
  1602  	PCMPEQB X0, X1
  1603  	PMOVMSKB X1, AX
  1604  	XORQ	$0xffff, AX
  1605  	JNE	diff64
  1606  
  1607  	ADDQ	$64, SI
  1608  	ADDQ	$64, DI
  1609  	SUBQ	$64, R8
  1610  	CMPQ	R8, $64
  1611  	JBE	loop
  1612  	JMP	big_loop
  1613  
  1614  	// Compare 64 bytes per loop iteration.
  1615  	// Loop is unrolled and uses AVX2.
  1616  big_loop_avx2:
  1617  	MOVHDU	(SI), X2
  1618  	MOVHDU	(DI), X3
  1619  	MOVHDU	32(SI), X4
  1620  	MOVHDU	32(DI), X5
  1621  	VPCMPEQB X2, X3, X0
  1622  	VPMOVMSKB X0, AX
  1623  	XORL	$0xffffffff, AX
  1624  	JNE	diff32_avx2
  1625  	VPCMPEQB X4, X5, X6
  1626  	VPMOVMSKB X6, AX
  1627  	XORL	$0xffffffff, AX
  1628  	JNE	diff64_avx2
  1629  
  1630  	ADDQ	$64, SI
  1631  	ADDQ	$64, DI
  1632  	SUBQ	$64, R8
  1633  	CMPQ	R8, $64
  1634  	JB	big_loop_avx2_exit
  1635  	JMP	big_loop_avx2
  1636  
  1637  	// Avoid AVX->SSE transition penalty and search the first 32 bytes of the 64-byte chunk.
  1638  diff32_avx2:
  1639  	VZEROUPPER
  1640  	JMP diff16
  1641  
  1642  	// Same as diff32_avx2, but for last 32 bytes.
  1643  diff64_avx2:
  1644  	VZEROUPPER
  1645  	JMP diff48
  1646  
  1647  	// For a remainder of fewer than 64 bytes, jump to the normal loop.
  1648  big_loop_avx2_exit:
  1649  	VZEROUPPER
  1650  	JMP loop
  1651  
  1652  
  1653  // TODO: Also use this in bytes.Index
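        // indexShortStr scans for a short needle by loading its first chunk
        // (2, 4, 8 or 16 bytes, depending on length) and comparing it at every
        // offset of s; for lengths that don't fit a single load, a second chunk
        // anchored at the end of the needle is checked in the partial_success
        // paths before declaring a match.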
  1654  TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
  1655  	MOVQ s+0(FP), DI
  1656  	MOVQ s_len+8(FP), CX
  1657  	MOVQ c+16(FP), AX
  1658  	MOVQ c_len+24(FP), BX
  1659  	CMPQ BX, CX
  1660  	JA fail
  1661  	CMPQ BX, $2
  1662  	JA   _3_or_more
  1663  	MOVW (AX), AX
  1664  	LEAQ -1(DI)(CX*1), CX
  1665  loop2:
  1666  	MOVW (DI), SI
  1667  	CMPW SI,AX
  1668  	JZ success
  1669  	ADDQ $1,DI
  1670  	CMPQ DI,CX
  1671  	JB loop2
  1672  	JMP fail
  1673  _3_or_more:
  1674  	CMPQ BX, $3
  1675  	JA   _4_or_more
  1676  	MOVW 1(AX), DX
  1677  	MOVW (AX), AX
  1678  	LEAQ -2(DI)(CX*1), CX
  1679  loop3:
  1680  	MOVW (DI), SI
  1681  	CMPW SI,AX
  1682  	JZ   partial_success3
  1683  	ADDQ $1,DI
  1684  	CMPQ DI,CX
  1685  	JB loop3
  1686  	JMP fail
  1687  partial_success3:
  1688  	MOVW 1(DI), SI
  1689  	CMPW SI,DX
  1690  	JZ success
  1691  	ADDQ $1,DI
  1692  	CMPQ DI,CX
  1693  	JB loop3
  1694  	JMP fail
  1695  _4_or_more:
  1696  	CMPQ BX, $4
  1697  	JA   _5_or_more
  1698  	MOVL (AX), AX
  1699  	LEAQ -3(DI)(CX*1), CX
  1700  loop4:
  1701  	MOVL (DI), SI
  1702  	CMPL SI,AX
  1703  	JZ   success
  1704  	ADDQ $1,DI
  1705  	CMPQ DI,CX
  1706  	JB loop4
  1707  	JMP fail
  1708  _5_or_more:
  1709  	CMPQ BX, $7
  1710  	JA   _8_or_more
  1711  	LEAQ 1(DI)(CX*1), CX
  1712  	SUBQ BX, CX
  1713  	MOVL -4(AX)(BX*1), DX
  1714  	MOVL (AX), AX
  1715  loop5to7:
  1716  	MOVL (DI), SI
  1717  	CMPL SI,AX
  1718  	JZ   partial_success5to7
  1719  	ADDQ $1,DI
  1720  	CMPQ DI,CX
  1721  	JB loop5to7
  1722  	JMP fail
  1723  partial_success5to7:
  1724  	MOVL -4(BX)(DI*1), SI
  1725  	CMPL SI,DX
  1726  	JZ success
  1727  	ADDQ $1,DI
  1728  	CMPQ DI,CX
  1729  	JB loop5to7
  1730  	JMP fail
  1731  _8_or_more:
  1732  	CMPQ BX, $8
  1733  	JA   _9_or_more
  1734  	MOVQ (AX), AX
  1735  	LEAQ -7(DI)(CX*1), CX
  1736  loop8:
  1737  	MOVQ (DI), SI
  1738  	CMPQ SI,AX
  1739  	JZ   success
  1740  	ADDQ $1,DI
  1741  	CMPQ DI,CX
  1742  	JB loop8
  1743  	JMP fail
  1744  _9_or_more:
  1745  	CMPQ BX, $16
  1746  	JA   _16_or_more
  1747  	LEAQ 1(DI)(CX*1), CX
  1748  	SUBQ BX, CX
  1749  	MOVQ -8(AX)(BX*1), DX
  1750  	MOVQ (AX), AX
  1751  loop9to15:
  1752  	MOVQ (DI), SI
  1753  	CMPQ SI,AX
  1754  	JZ   partial_success9to15
  1755  	ADDQ $1,DI
  1756  	CMPQ DI,CX
  1757  	JB loop9to15
  1758  	JMP fail
  1759  partial_success9to15:
  1760  	MOVQ -8(BX)(DI*1), SI
  1761  	CMPQ SI,DX
  1762  	JZ success
  1763  	ADDQ $1,DI
  1764  	CMPQ DI,CX
  1765  	JB loop9to15
  1766  	JMP fail
  1767  _16_or_more:
  1768  	CMPQ BX, $16
  1769  	JA   _17_to_31
  1770  	MOVOU (AX), X1
  1771  	LEAQ -15(DI)(CX*1), CX
  1772  loop16:
  1773  	MOVOU (DI), X2
  1774  	PCMPEQB X1, X2
  1775  	PMOVMSKB X2, SI
  1776  	CMPQ  SI, $0xffff
  1777  	JE   success
  1778  	ADDQ $1,DI
  1779  	CMPQ DI,CX
  1780  	JB loop16
  1781  	JMP fail
  1782  _17_to_31:
  1783  	LEAQ 1(DI)(CX*1), CX
  1784  	SUBQ BX, CX
  1785  	MOVOU -16(AX)(BX*1), X0
  1786  	MOVOU (AX), X1
  1787  loop17to31:
  1788  	MOVOU (DI), X2
  1789  	PCMPEQB X1,X2
  1790  	PMOVMSKB X2, SI
  1791  	CMPQ  SI, $0xffff
  1792  	JE   partial_success17to31
  1793  	ADDQ $1,DI
  1794  	CMPQ DI,CX
  1795  	JB loop17to31
  1796  	JMP fail
  1797  partial_success17to31:
  1798  	MOVOU -16(BX)(DI*1), X3
  1799  	PCMPEQB X0, X3
  1800  	PMOVMSKB X3, SI
  1801  	CMPQ  SI, $0xffff
  1802  	JE success
  1803  	ADDQ $1,DI
  1804  	CMPQ DI,CX
  1805  	JB loop17to31
  1806  fail:
  1807  	MOVQ $-1, ret+32(FP)
  1808  	RET
  1809  success:
  1810  	SUBQ s+0(FP), DI
  1811  	MOVQ DI, ret+32(FP)
  1812  	RET
  1813  
  1814  
  1815  TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
  1816  	MOVQ s+0(FP), SI
  1817  	MOVQ s_len+8(FP), BX
  1818  	MOVB c+24(FP), AL
  1819  	LEAQ ret+32(FP), R8
  1820  	JMP  runtime·indexbytebody(SB)
  1821  
  1822  TEXT strings·IndexByte(SB),NOSPLIT,$0-32
  1823  	MOVQ s+0(FP), SI
  1824  	MOVQ s_len+8(FP), BX
  1825  	MOVB c+16(FP), AL
  1826  	LEAQ ret+24(FP), R8
  1827  	JMP  runtime·indexbytebody(SB)
  1828  
  1829  // input:
  1830  //   SI: data
  1831  //   BX: data len
  1832  //   AL: byte sought
  1833  //   R8: address to put result
  1834  TEXT runtime·indexbytebody(SB),NOSPLIT,$0
  1835  	MOVQ SI, DI
  1836  
  1837  	CMPQ BX, $16
  1838  	JLT small
  1839  
  1840  	CMPQ BX, $32
  1841  	JA avx2
  1842  no_avx2:
  1843  	// round up to first 16-byte boundary
  1844  	TESTQ $15, SI
  1845  	JZ aligned
  1846  	MOVQ SI, CX
  1847  	ANDQ $~15, CX
  1848  	ADDQ $16, CX
  1849  
  1850  	// search the beginning
  1851  	SUBQ SI, CX
  1852  	REPN; SCASB
  1853  	JZ success
  1854  
  1855  // DI is 16-byte aligned; get ready to search using SSE instructions
  1856  aligned:
  1857  	// round down to last 16-byte boundary
  1858  	MOVQ BX, R11
  1859  	ADDQ SI, R11
  1860  	ANDQ $~15, R11
  1861  
  1862  	// shuffle X0 around so that each byte contains c
  1863  	MOVD AX, X0
  1864  	PUNPCKLBW X0, X0
  1865  	PUNPCKLBW X0, X0
  1866  	PSHUFL $0, X0, X0
  1867  	JMP condition
  1868  
  1869  sse:
  1870  	// move the next 16-byte chunk of the buffer into X1
  1871  	MOVO (DI), X1
  1872  	// compare bytes in X0 to X1
  1873  	PCMPEQB X0, X1
  1874  	// take the top bit of each byte in X1 and put the result in DX
  1875  	PMOVMSKB X1, DX
  1876  	TESTL DX, DX
  1877  	JNZ ssesuccess
  1878  	ADDQ $16, DI
  1879  
  1880  condition:
  1881  	CMPQ DI, R11
  1882  	JLT sse
  1883  
  1884  	// search the end
  1885  	MOVQ SI, CX
  1886  	ADDQ BX, CX
  1887  	SUBQ R11, CX
  1888  	// if CX == 0, the zero flag will be set and we'll end up
  1889  	// returning a false success
  1890  	JZ failure
  1891  	REPN; SCASB
  1892  	JZ success
  1893  
  1894  failure:
  1895  	MOVQ $-1, (R8)
  1896  	RET
  1897  
  1898  // handle lengths < 16
  1899  small:
  1900  	MOVQ BX, CX
  1901  	REPN; SCASB
  1902  	JZ success
  1903  	MOVQ $-1, (R8)
  1904  	RET
  1905  
  1906  avx2:
  1907  	CMPB   runtime·support_avx2(SB), $1
  1908  	JNE no_avx2
  1909  	MOVD AX, X0
  1910  	LEAQ -32(SI)(BX*1), R11
  1911  	VPBROADCASTB  X0, X1
  1912  avx2_loop:
  1913  	MOVHDU (DI), X2
  1914  	VPCMPEQB X1, X2, X3
  1915  	VPTEST X3, X3
  1916  	JNZ avx2success
  1917  	ADDQ $32, DI
  1918  	CMPQ DI, R11
  1919  	JLT avx2_loop
  1920  	MOVQ R11, DI
  1921  	MOVHDU (DI), X2
  1922  	VPCMPEQB X1, X2, X3
  1923  	VPTEST X3, X3
  1924  	JNZ avx2success
  1925  	VZEROUPPER
  1926  	MOVQ $-1, (R8)
  1927  	RET
  1928  
  1929  avx2success:
  1930  	VPMOVMSKB X3, DX
  1931  	BSFL DX, DX
  1932  	SUBQ SI, DI
  1933  	ADDQ DI, DX
  1934  	MOVQ DX, (R8)
  1935  	VZEROUPPER
  1936  	RET
  1937  
  1938  // we've found the chunk containing the byte
  1939  // now just figure out which specific byte it is
  1940  ssesuccess:
  1941  	// get the index of the least significant set bit
  1942  	BSFW DX, DX
  1943  	SUBQ SI, DI
  1944  	ADDQ DI, DX
  1945  	MOVQ DX, (R8)
  1946  	RET
  1947  
  1948  success:
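        	// REPNE SCASB stops with DI one past the matching byte, so the
        	// byte's offset is DI-SI-1.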
  1949  	SUBQ SI, DI
  1950  	SUBL $1, DI
  1951  	MOVQ DI, (R8)
  1952  	RET
  1953  
  1954  TEXT bytes·Equal(SB),NOSPLIT,$0-49
  1955  	MOVQ	a_len+8(FP), BX
  1956  	MOVQ	b_len+32(FP), CX
  1957  	CMPQ	BX, CX
  1958  	JNE	eqret
  1959  	MOVQ	a+0(FP), SI
  1960  	MOVQ	b+24(FP), DI
  1961  	LEAQ	ret+48(FP), AX
  1962  	JMP	runtime·memeqbody(SB)
  1963  eqret:
  1964  	MOVB	$0, ret+48(FP)
  1965  	RET
  1966  
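        // fastrand1 is a small per-m pseudo-random generator (roughly an
        // LFSR-style update): the state is doubled and, when the doubled value's
        // high bit is set, the constant 0x88888eef is XORed in; otherwise the
        // CMOVLMI keeps the plain doubled value.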
  1967  TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
  1968  	get_tls(CX)
  1969  	MOVQ	g(CX), AX
  1970  	MOVQ	g_m(AX), AX
  1971  	MOVL	m_fastrand(AX), DX
  1972  	ADDL	DX, DX
  1973  	MOVL	DX, BX
  1974  	XORL	$0x88888eef, DX
  1975  	CMOVLMI	BX, DX
  1976  	MOVL	DX, m_fastrand(AX)
  1977  	MOVL	DX, ret+0(FP)
  1978  	RET
  1979  
  1980  TEXT runtime·return0(SB), NOSPLIT, $0
  1981  	MOVL	$0, AX
  1982  	RET
  1983  
  1984  
  1985  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  1986  // Must obey the gcc calling convention.
  1987  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  1988  	get_tls(CX)
  1989  	MOVQ	g(CX), AX
  1990  	MOVQ	g_m(AX), AX
  1991  	MOVQ	m_curg(AX), AX
  1992  	MOVQ	(g_stack+stack_hi)(AX), AX
  1993  	RET
  1994  
  1995  // The top-most function running on a goroutine
  1996  // returns to goexit+PCQuantum.
  1997  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  1998  	BYTE	$0x90	// NOP
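        	// The NOP above keeps goexit+PCQuantum inside goexit's code range,
        	// so the return PC pushed for a goroutine's top-most function still
        	// resolves to goexit during traceback.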
  1999  	CALL	runtime·goexit1(SB)	// does not return
  2000  	// traceback from goexit1 must hit code range of goexit
  2001  	BYTE	$0x90	// NOP
  2002  
  2003  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
  2004  	MOVQ	addr+0(FP), AX
  2005  	PREFETCHT0	(AX)
  2006  	RET
  2007  
  2008  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
  2009  	MOVQ	addr+0(FP), AX
  2010  	PREFETCHT1	(AX)
  2011  	RET
  2012  
  2013  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
  2014  	MOVQ	addr+0(FP), AX
  2015  	PREFETCHT2	(AX)
  2016  	RET
  2017  
  2018  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
  2019  	MOVQ	addr+0(FP), AX
  2020  	PREFETCHNTA	(AX)
  2021  	RET
  2022  
  2023  // This is called from .init_array and follows the platform, not Go, ABI.
  2024  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  2025  	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
  2026  	MOVQ	runtime·lastmoduledatap(SB), AX
  2027  	MOVQ	DI, moduledata_next(AX)
  2028  	MOVQ	DI, runtime·lastmoduledatap(SB)
  2029  	POPQ	R15
  2030  	RET