github.com/sbinet/go@v0.0.0-20160827155028-54d7de7dd62b/src/runtime/asm_amd64.s

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
    11  	// copy arguments forward on an even stack
    12  	MOVQ	DI, AX		// argc
    13  	MOVQ	SI, BX		// argv
    14  	SUBQ	$(4*8+7), SP		// 2args 2auto
    15  	ANDQ	$~15, SP
    16  	MOVQ	AX, 16(SP)
    17  	MOVQ	BX, 24(SP)
    18  	
    19  	// create istack out of the given (operating system) stack.
    20  	// _cgo_init may update stackguard.
    21  	MOVQ	$runtime·g0(SB), DI
    22  	LEAQ	(-64*1024+104)(SP), BX
    23  	MOVQ	BX, g_stackguard0(DI)
    24  	MOVQ	BX, g_stackguard1(DI)
    25  	MOVQ	BX, (g_stack+stack_lo)(DI)
    26  	MOVQ	SP, (g_stack+stack_hi)(DI)
    27  
    28  	// find out information about the processor we're on
    29  	MOVQ	$0, AX
    30  	CPUID
    31  	MOVQ	AX, SI
    32  	CMPQ	AX, $0
    33  	JE	nocpuinfo
    34  
    35  	// Figure out how to serialize RDTSC.
    36  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    37  	// Don't know about the rest, so let's do MFENCE.
    38  	CMPL	BX, $0x756E6547  // "Genu"
    39  	JNE	notintel
    40  	CMPL	DX, $0x49656E69  // "ineI"
    41  	JNE	notintel
    42  	CMPL	CX, $0x6C65746E  // "ntel"
    43  	JNE	notintel
    44  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
    45  notintel:
    46  
    47  	// Load EAX=1 cpuid flags
    48  	MOVQ	$1, AX
    49  	CPUID
    50  	MOVL	CX, runtime·cpuid_ecx(SB)
    51  	MOVL	DX, runtime·cpuid_edx(SB)
    52  
    53  	// Load EAX=7/ECX=0 cpuid flags
    54  	CMPQ	SI, $7
    55  	JLT	no7
    56  	MOVL	$7, AX
    57  	MOVL	$0, CX
    58  	CPUID
    59  	MOVL	BX, runtime·cpuid_ebx7(SB)
    60  no7:
    61  	// Detect AVX and AVX2 as per chapter 14.7.1, Detection of AVX2, of [1]
    62  	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
    63  	// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
    64  	MOVL	runtime·cpuid_ecx(SB), CX
    65  	ANDL    $0x18000000, CX // check for OSXSAVE and AVX bits
    66  	CMPL    CX, $0x18000000
    67  	JNE     noavx
    68  	MOVL    $0, CX
    69  	// For XGETBV, OSXSAVE bit is required and sufficient
    70  	XGETBV
    71  	ANDL    $6, AX
    72  	CMPL    AX, $6 // Check for OS support of YMM registers
    73  	JNE     noavx
    74  	MOVB    $1, runtime·support_avx(SB)
    75  	TESTL   $(1<<5), runtime·cpuid_ebx7(SB) // check for AVX2 bit
    76  	JEQ     noavx2
    77  	MOVB    $1, runtime·support_avx2(SB)
    78  	JMP     nocpuinfo
    79  noavx:
    80  	MOVB    $0, runtime·support_avx(SB)
    81  noavx2:
    82  	MOVB    $0, runtime·support_avx2(SB)
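
        	// The detection above, as an illustrative Go sketch. cpuid and xgetbv
        	// stand in for the corresponding instructions; the names are for
        	// exposition only and are not Go functions in this runtime.
        	//
        	//	_, _, ecx, _ := cpuid(1, 0)
        	//	if ecx&(1<<27) != 0 && ecx&(1<<28) != 0 { // OSXSAVE and AVX
        	//		xcr0, _ := xgetbv(0)
        	//		if xcr0&6 == 6 { // OS saves both XMM and YMM state
        	//			support_avx = true
        	//			support_avx2 = cpuid_ebx7&(1<<5) != 0
        	//		}
        	//	}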
    83  nocpuinfo:	
    84  	
    85  	// if there is an _cgo_init, call it.
    86  	MOVQ	_cgo_init(SB), AX
    87  	TESTQ	AX, AX
    88  	JZ	needtls
    89  	// g0 already in DI
    90  	MOVQ	DI, CX	// Win64 uses CX for first parameter
    91  	MOVQ	$setg_gcc<>(SB), SI
    92  	CALL	AX
    93  
    94  	// update stackguard after _cgo_init
    95  	MOVQ	$runtime·g0(SB), CX
    96  	MOVQ	(g_stack+stack_lo)(CX), AX
    97  	ADDQ	$const__StackGuard, AX
    98  	MOVQ	AX, g_stackguard0(CX)
    99  	MOVQ	AX, g_stackguard1(CX)
   100  
   101  #ifndef GOOS_windows
   102  	JMP ok
   103  #endif
   104  needtls:
   105  #ifdef GOOS_plan9
   106  	// skip TLS setup on Plan 9
   107  	JMP ok
   108  #endif
   109  #ifdef GOOS_solaris
   110  	// skip TLS setup on Solaris
   111  	JMP ok
   112  #endif
   113  
   114  	LEAQ	runtime·m0+m_tls(SB), DI
   115  	CALL	runtime·settls(SB)
   116  
   117  	// store through it, to make sure it works
   118  	get_tls(BX)
   119  	MOVQ	$0x123, g(BX)
   120  	MOVQ	runtime·m0+m_tls(SB), AX
   121  	CMPQ	AX, $0x123
   122  	JEQ 2(PC)
   123  	MOVL	AX, 0	// abort
   124  ok:
   125  	// set the per-goroutine and per-mach "registers"
   126  	get_tls(BX)
   127  	LEAQ	runtime·g0(SB), CX
   128  	MOVQ	CX, g(BX)
   129  	LEAQ	runtime·m0(SB), AX
   130  
   131  	// save m->g0 = g0
   132  	MOVQ	CX, m_g0(AX)
   133  	// save m0 to g0->m
   134  	MOVQ	AX, g_m(CX)
   135  
   136  	CLD				// convention is D is always left cleared
   137  	CALL	runtime·check(SB)
   138  
   139  	MOVL	16(SP), AX		// copy argc
   140  	MOVL	AX, 0(SP)
   141  	MOVQ	24(SP), AX		// copy argv
   142  	MOVQ	AX, 8(SP)
   143  	CALL	runtime·args(SB)
   144  	CALL	runtime·osinit(SB)
   145  	CALL	runtime·schedinit(SB)
   146  
   147  	// create a new goroutine to start program
   148  	MOVQ	$runtime·mainPC(SB), AX		// entry
   149  	PUSHQ	AX
   150  	PUSHQ	$0			// arg size
   151  	CALL	runtime·newproc(SB)
   152  	POPQ	AX
   153  	POPQ	AX
   154  
   155  	// start this M
   156  	CALL	runtime·mstart(SB)
   157  
   158  	MOVL	$0xf1, 0xf1  // crash
   159  	RET
   160  
   161  DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
   162  GLOBL	runtime·mainPC(SB),RODATA,$8
   163  
   164  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
   165  	BYTE	$0xcc
   166  	RET
   167  
   168  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   169  	// No per-thread init.
   170  	RET
   171  
   172  /*
   173   *  go-routine
   174   */
   175  
   176  // void gosave(Gobuf*)
   177  // save state in Gobuf; setjmp
   178  TEXT runtime·gosave(SB), NOSPLIT, $0-8
   179  	MOVQ	buf+0(FP), AX		// gobuf
   180  	LEAQ	buf+0(FP), BX		// caller's SP
   181  	MOVQ	BX, gobuf_sp(AX)
   182  	MOVQ	0(SP), BX		// caller's PC
   183  	MOVQ	BX, gobuf_pc(AX)
   184  	MOVQ	$0, gobuf_ret(AX)
   185  	MOVQ	$0, gobuf_ctxt(AX)
   186  	MOVQ	BP, gobuf_bp(AX)
   187  	get_tls(CX)
   188  	MOVQ	g(CX), BX
   189  	MOVQ	BX, gobuf_g(AX)
   190  	RET
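
        // For reference, the gobuf_* offsets used above correspond to the Gobuf
        // struct; an illustrative Go view of its fields is shown below (see
        // runtime2.go for the authoritative definition):
        //
        //	type gobuf struct {
        //		sp   uintptr        // stack pointer saved/restored here
        //		pc   uintptr        // return PC
        //		g    guintptr       // g this buffer belongs to
        //		ctxt unsafe.Pointer // closure context
        //		ret  sys.Uintreg    // return value
        //		lr   uintptr        // unused on amd64
        //		bp   uintptr        // frame pointer, for the framepointer experiment
        //	}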
   191  
   192  // void gogo(Gobuf*)
   193  // restore state from Gobuf; longjmp
   194  TEXT runtime·gogo(SB), NOSPLIT, $0-8
   195  	MOVQ	buf+0(FP), BX		// gobuf
   196  	MOVQ	gobuf_g(BX), DX
   197  	MOVQ	0(DX), CX		// make sure g != nil
   198  	get_tls(CX)
   199  	MOVQ	DX, g(CX)
   200  	MOVQ	gobuf_sp(BX), SP	// restore SP
   201  	MOVQ	gobuf_ret(BX), AX
   202  	MOVQ	gobuf_ctxt(BX), DX
   203  	MOVQ	gobuf_bp(BX), BP
   204  	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   205  	MOVQ	$0, gobuf_ret(BX)
   206  	MOVQ	$0, gobuf_ctxt(BX)
   207  	MOVQ	$0, gobuf_bp(BX)
   208  	MOVQ	gobuf_pc(BX), BX
   209  	JMP	BX
   210  
   211  // func mcall(fn func(*g))
   212  // Switch to m->g0's stack, call fn(g).
   213  // Fn must never return. It should gogo(&g->sched)
   214  // to keep running g.
   215  TEXT runtime·mcall(SB), NOSPLIT, $0-8
   216  	MOVQ	fn+0(FP), DI
   217  	
   218  	get_tls(CX)
   219  	MOVQ	g(CX), AX	// save state in g->sched
   220  	MOVQ	0(SP), BX	// caller's PC
   221  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   222  	LEAQ	fn+0(FP), BX	// caller's SP
   223  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   224  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   225  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   226  
   227  	// switch to m->g0 & its stack, call fn
   228  	MOVQ	g(CX), BX
   229  	MOVQ	g_m(BX), BX
   230  	MOVQ	m_g0(BX), SI
   231  	CMPQ	SI, AX	// if g == m->g0 call badmcall
   232  	JNE	3(PC)
   233  	MOVQ	$runtime·badmcall(SB), AX
   234  	JMP	AX
   235  	MOVQ	SI, g(CX)	// g = m->g0
   236  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   237  	PUSHQ	AX
   238  	MOVQ	DI, DX
   239  	MOVQ	0(DI), DI
   240  	CALL	DI
   241  	POPQ	AX
   242  	MOVQ	$runtime·badmcall2(SB), AX
   243  	JMP	AX
   244  	RET
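
        // Roughly, in Go terms (illustrative pseudocode, not a real declaration):
        //
        //	func mcall(fn func(*g)) {
        //		gp := getg()
        //		// save caller's PC, SP and BP into gp.sched so fn can later
        //		// gogo(&gp.sched) to resume gp
        //		setg(gp.m.g0)
        //		// switch SP to g0.sched.sp, then:
        //		fn(gp) // must not return
        //		badmcall2(fn)
        //	}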
   245  
   246  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   247  // of the G stack. We need to distinguish the routine that
   248  // lives at the bottom of the G stack from the one that lives
   249  // at the top of the system stack because the one at the top of
   250  // the system stack terminates the stack walk (see topofstack()).
   251  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   252  	RET
   253  
   254  // func systemstack(fn func())
   255  TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   256  	MOVQ	fn+0(FP), DI	// DI = fn
   257  	get_tls(CX)
   258  	MOVQ	g(CX), AX	// AX = g
   259  	MOVQ	g_m(AX), BX	// BX = m
   260  
   261  	MOVQ	m_gsignal(BX), DX	// DX = gsignal
   262  	CMPQ	AX, DX
   263  	JEQ	noswitch
   264  
   265  	MOVQ	m_g0(BX), DX	// DX = g0
   266  	CMPQ	AX, DX
   267  	JEQ	noswitch
   268  
   269  	MOVQ	m_curg(BX), R8
   270  	CMPQ	AX, R8
   271  	JEQ	switch
   272  	
   273  	// Bad: g is not gsignal, not g0, not curg. What is it?
   274  	MOVQ	$runtime·badsystemstack(SB), AX
   275  	CALL	AX
   276  
   277  switch:
   278  	// save our state in g->sched. Pretend to
   279  	// be systemstack_switch if the G stack is scanned.
   280  	MOVQ	$runtime·systemstack_switch(SB), SI
   281  	MOVQ	SI, (g_sched+gobuf_pc)(AX)
   282  	MOVQ	SP, (g_sched+gobuf_sp)(AX)
   283  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   284  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   285  
   286  	// switch to g0
   287  	MOVQ	DX, g(CX)
   288  	MOVQ	(g_sched+gobuf_sp)(DX), BX
   289  	// make it look like mstart called systemstack on g0, to stop traceback
   290  	SUBQ	$8, BX
   291  	MOVQ	$runtime·mstart(SB), DX
   292  	MOVQ	DX, 0(BX)
   293  	MOVQ	BX, SP
   294  
   295  	// call target function
   296  	MOVQ	DI, DX
   297  	MOVQ	0(DI), DI
   298  	CALL	DI
   299  
   300  	// switch back to g
   301  	get_tls(CX)
   302  	MOVQ	g(CX), AX
   303  	MOVQ	g_m(AX), BX
   304  	MOVQ	m_curg(BX), AX
   305  	MOVQ	AX, g(CX)
   306  	MOVQ	(g_sched+gobuf_sp)(AX), SP
   307  	MOVQ	$0, (g_sched+gobuf_sp)(AX)
   308  	RET
   309  
   310  noswitch:
   311  	// already on m stack, just call directly
   312  	MOVQ	DI, DX
   313  	MOVQ	0(DI), DI
   314  	CALL	DI
   315  	RET
   316  
   317  /*
   318   * support for morestack
   319   */
   320  
   321  // Called during function prolog when more stack is needed.
   322  //
   323  // The traceback routines see morestack on a g0 as being
   324  // the top of a stack (for example, morestack calling newstack
   325  // calling the scheduler calling newm calling gc), so we must
   326  // record an argument size. For that purpose, it has no arguments.
   327  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   328  	// Cannot grow scheduler stack (m->g0).
   329  	get_tls(CX)
   330  	MOVQ	g(CX), BX
   331  	MOVQ	g_m(BX), BX
   332  	MOVQ	m_g0(BX), SI
   333  	CMPQ	g(CX), SI
   334  	JNE	2(PC)
   335  	INT	$3
   336  
   337  	// Cannot grow signal stack (m->gsignal).
   338  	MOVQ	m_gsignal(BX), SI
   339  	CMPQ	g(CX), SI
   340  	JNE	2(PC)
   341  	INT	$3
   342  
   343  	// Called from f.
   344  	// Set m->morebuf to f's caller.
   345  	MOVQ	8(SP), AX	// f's caller's PC
   346  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   347  	LEAQ	16(SP), AX	// f's caller's SP
   348  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   349  	get_tls(CX)
   350  	MOVQ	g(CX), SI
   351  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   352  
   353  	// Set g->sched to context in f.
   354  	MOVQ	0(SP), AX // f's PC
   355  	MOVQ	AX, (g_sched+gobuf_pc)(SI)
   356  	MOVQ	SI, (g_sched+gobuf_g)(SI)
   357  	LEAQ	8(SP), AX // f's SP
   358  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   359  	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
   360  	MOVQ	BP, (g_sched+gobuf_bp)(SI)
   361  
   362  	// Call newstack on m->g0's stack.
   363  	MOVQ	m_g0(BX), BX
   364  	MOVQ	BX, g(CX)
   365  	MOVQ	(g_sched+gobuf_sp)(BX), SP
   366  	CALL	runtime·newstack(SB)
   367  	MOVQ	$0, 0x1003	// crash if newstack returns
   368  	RET
   369  
   370  // morestack but not preserving ctxt.
   371  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   372  	MOVL	$0, DX
   373  	JMP	runtime·morestack(SB)
   374  
   375  TEXT runtime·stackBarrier(SB),NOSPLIT,$0
   376  	// We came here via a RET to an overwritten return PC.
   377  	// AX may be live. Other registers are available.
   378  
   379  	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   380  	get_tls(CX)
   381  	MOVQ	g(CX), CX
   382  	MOVQ	(g_stkbar+slice_array)(CX), DX
   383  	MOVQ	g_stkbarPos(CX), BX
   384  	IMULQ	$stkbar__size, BX	// Too big for SIB.
   385  	MOVQ	stkbar_savedLRPtr(DX)(BX*1), R8
   386  	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
   387  	// Assert that we're popping the right saved LR.
   388  	ADDQ	$8, R8
   389  	CMPQ	R8, SP
   390  	JEQ	2(PC)
   391  	MOVL	$0, 0
   392  	// Record that this stack barrier was hit.
   393  	ADDQ	$1, g_stkbarPos(CX)
   394  	// Jump to the original return PC.
   395  	JMP	BX
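
        // In Go terms, the barrier above does roughly this (illustrative sketch of
        // the runtime's stack-barrier bookkeeping):
        //
        //	gp := getg()
        //	bar := &gp.stkbar[gp.stkbarPos]
        //	// bar.savedLRPtr+8 must equal SP: that slot held the LR we just popped
        //	gp.stkbarPos++
        //	// resume at bar.savedLRVal, the original return PC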
   396  
   397  // reflectcall: call a function with the given argument list
   398  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   399  // we don't have variable-sized frames, so we use a small number
   400  // of constant-sized-frame functions to encode a few bits of size in the pc.
   401  // Caution: ugly multiline assembly macros in your future!
   402  
   403  #define DISPATCH(NAME,MAXSIZE)		\
   404  	CMPQ	CX, $MAXSIZE;		\
   405  	JA	3(PC);			\
   406  	MOVQ	$NAME(SB), AX;		\
   407  	JMP	AX
   408  // Note: can't just "JMP NAME(SB)" - bad inlining results.
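
        // The dispatch below is equivalent to this Go-style sketch (illustrative):
        //
        //	switch {
        //	case argsize <= 32:
        //		call32(argtype, f, arg, argsize, retoffset)
        //	case argsize <= 64:
        //		call64(argtype, f, arg, argsize, retoffset)
        //	// ... one case per power of two ...
        //	case argsize <= 1073741824:
        //		call1073741824(argtype, f, arg, argsize, retoffset)
        //	default:
        //		badreflectcall()
        //	}
        //
        // Each callNNN has a fixed-size frame (NNN bytes of argument space), so the
        // frame size can be recovered from the PC alone during stack walks.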
   409  
   410  TEXT reflect·call(SB), NOSPLIT, $0-0
   411  	JMP	·reflectcall(SB)
   412  
   413  TEXT ·reflectcall(SB), NOSPLIT, $0-32
   414  	MOVLQZX argsize+24(FP), CX
   415  	// NOTE(rsc): No call16, because CALLFN needs four words
   416  	// of argument space to invoke callwritebarrier.
   417  	DISPATCH(runtime·call32, 32)
   418  	DISPATCH(runtime·call64, 64)
   419  	DISPATCH(runtime·call128, 128)
   420  	DISPATCH(runtime·call256, 256)
   421  	DISPATCH(runtime·call512, 512)
   422  	DISPATCH(runtime·call1024, 1024)
   423  	DISPATCH(runtime·call2048, 2048)
   424  	DISPATCH(runtime·call4096, 4096)
   425  	DISPATCH(runtime·call8192, 8192)
   426  	DISPATCH(runtime·call16384, 16384)
   427  	DISPATCH(runtime·call32768, 32768)
   428  	DISPATCH(runtime·call65536, 65536)
   429  	DISPATCH(runtime·call131072, 131072)
   430  	DISPATCH(runtime·call262144, 262144)
   431  	DISPATCH(runtime·call524288, 524288)
   432  	DISPATCH(runtime·call1048576, 1048576)
   433  	DISPATCH(runtime·call2097152, 2097152)
   434  	DISPATCH(runtime·call4194304, 4194304)
   435  	DISPATCH(runtime·call8388608, 8388608)
   436  	DISPATCH(runtime·call16777216, 16777216)
   437  	DISPATCH(runtime·call33554432, 33554432)
   438  	DISPATCH(runtime·call67108864, 67108864)
   439  	DISPATCH(runtime·call134217728, 134217728)
   440  	DISPATCH(runtime·call268435456, 268435456)
   441  	DISPATCH(runtime·call536870912, 536870912)
   442  	DISPATCH(runtime·call1073741824, 1073741824)
   443  	MOVQ	$runtime·badreflectcall(SB), AX
   444  	JMP	AX
   445  
   446  #define CALLFN(NAME,MAXSIZE)			\
   447  TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   448  	NO_LOCAL_POINTERS;			\
   449  	/* copy arguments to stack */		\
   450  	MOVQ	argptr+16(FP), SI;		\
   451  	MOVLQZX argsize+24(FP), CX;		\
   452  	MOVQ	SP, DI;				\
   453  	REP;MOVSB;				\
   454  	/* call function */			\
   455  	MOVQ	f+8(FP), DX;			\
   456  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   457  	CALL	(DX);				\
   458  	/* copy return values back */		\
   459  	MOVQ	argptr+16(FP), DI;		\
   460  	MOVLQZX	argsize+24(FP), CX;		\
   461  	MOVLQZX retoffset+28(FP), BX;		\
   462  	MOVQ	SP, SI;				\
   463  	ADDQ	BX, DI;				\
   464  	ADDQ	BX, SI;				\
   465  	SUBQ	BX, CX;				\
   466  	REP;MOVSB;				\
   467  	/* execute write barrier updates */	\
   468  	MOVQ	argtype+0(FP), DX;		\
   469  	MOVQ	argptr+16(FP), DI;		\
   470  	MOVLQZX	argsize+24(FP), CX;		\
   471  	MOVLQZX retoffset+28(FP), BX;		\
   472  	MOVQ	DX, 0(SP);			\
   473  	MOVQ	DI, 8(SP);			\
   474  	MOVQ	CX, 16(SP);			\
   475  	MOVQ	BX, 24(SP);			\
   476  	CALL	runtime·callwritebarrier(SB);	\
   477  	RET
   478  
   479  CALLFN(·call32, 32)
   480  CALLFN(·call64, 64)
   481  CALLFN(·call128, 128)
   482  CALLFN(·call256, 256)
   483  CALLFN(·call512, 512)
   484  CALLFN(·call1024, 1024)
   485  CALLFN(·call2048, 2048)
   486  CALLFN(·call4096, 4096)
   487  CALLFN(·call8192, 8192)
   488  CALLFN(·call16384, 16384)
   489  CALLFN(·call32768, 32768)
   490  CALLFN(·call65536, 65536)
   491  CALLFN(·call131072, 131072)
   492  CALLFN(·call262144, 262144)
   493  CALLFN(·call524288, 524288)
   494  CALLFN(·call1048576, 1048576)
   495  CALLFN(·call2097152, 2097152)
   496  CALLFN(·call4194304, 4194304)
   497  CALLFN(·call8388608, 8388608)
   498  CALLFN(·call16777216, 16777216)
   499  CALLFN(·call33554432, 33554432)
   500  CALLFN(·call67108864, 67108864)
   501  CALLFN(·call134217728, 134217728)
   502  CALLFN(·call268435456, 268435456)
   503  CALLFN(·call536870912, 536870912)
   504  CALLFN(·call1073741824, 1073741824)
   505  
   506  TEXT runtime·procyield(SB),NOSPLIT,$0-0
   507  	MOVL	cycles+0(FP), AX
   508  again:
   509  	PAUSE
   510  	SUBL	$1, AX
   511  	JNZ	again
   512  	RET
   513  
   514  
   515  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   516  	// Stores are already ordered on x86, so this is just a
   517  	// compile barrier.
   518  	RET
   519  
   520  // void jmpdefer(fn, sp);
   521  // called from deferreturn.
   522  // 1. pop the caller
   523  // 2. sub 5 bytes from the caller's return address
   524  // 3. jmp to the argument
   525  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   526  	MOVQ	fv+0(FP), DX	// fn
   527  	MOVQ	argp+8(FP), BX	// caller sp
   528  	LEAQ	-8(BX), SP	// caller sp after CALL
   529  	MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
   530  	SUBQ	$5, (SP)	// return to CALL again
   531  	MOVQ	0(DX), BX
   532  	JMP	BX	// but first run the deferred function
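
        // Illustrative Go-style sketch of the trick above (it assumes the call site
        // used the usual 5-byte CALL encoding; jmp is a stand-in for a tail jump):
        //
        //	func jmpdefer(fv *funcval, argp uintptr) {
        //		sp := argp - 8                       // slot holding deferreturn's return PC
        //		*(*uintptr)(unsafe.Pointer(sp)) -= 5 // back up to re-execute CALL deferreturn
        //		jmp(fv.fn)                           // deferred fn's RET re-enters deferreturn
        //	}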
   533  
   534  // Save state of caller into g->sched. Smashes R8, R9.
   535  TEXT gosave<>(SB),NOSPLIT,$0
   536  	get_tls(R8)
   537  	MOVQ	g(R8), R8
   538  	MOVQ	0(SP), R9
   539  	MOVQ	R9, (g_sched+gobuf_pc)(R8)
   540  	LEAQ	8(SP), R9
   541  	MOVQ	R9, (g_sched+gobuf_sp)(R8)
   542  	MOVQ	$0, (g_sched+gobuf_ret)(R8)
   543  	MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
   544  	MOVQ	BP, (g_sched+gobuf_bp)(R8)
   545  	RET
   546  
   547  // func asmcgocall(fn, arg unsafe.Pointer) int32
   548  // Call fn(arg) on the scheduler stack,
   549  // aligned appropriately for the gcc ABI.
   550  // See cgocall.go for more details.
   551  TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   552  	MOVQ	fn+0(FP), AX
   553  	MOVQ	arg+8(FP), BX
   554  
   555  	MOVQ	SP, DX
   556  
   557  	// Figure out if we need to switch to m->g0 stack.
   558  	// We get called to create new OS threads too, and those
   559  	// come in on the m->g0 stack already.
   560  	get_tls(CX)
   561  	MOVQ	g(CX), R8
   562  	CMPQ	R8, $0
   563  	JEQ	nosave
   564  	MOVQ	g_m(R8), R8
   565  	MOVQ	m_g0(R8), SI
   566  	MOVQ	g(CX), DI
   567  	CMPQ	SI, DI
   568  	JEQ	nosave
   569  	MOVQ	m_gsignal(R8), SI
   570  	CMPQ	SI, DI
   571  	JEQ	nosave
   572  	
   573  	// Switch to system stack.
   574  	MOVQ	m_g0(R8), SI
   575  	CALL	gosave<>(SB)
   576  	MOVQ	SI, g(CX)
   577  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   578  
   579  	// Now on a scheduling stack (a pthread-created stack).
   580  	// Make sure we have enough room for 4 stack-backed fast-call
   581  	// registers as per windows amd64 calling convention.
   582  	SUBQ	$64, SP
   583  	ANDQ	$~15, SP	// alignment for gcc ABI
   584  	MOVQ	DI, 48(SP)	// save g
   585  	MOVQ	(g_stack+stack_hi)(DI), DI
   586  	SUBQ	DX, DI
   587  	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   588  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   589  	MOVQ	BX, CX		// CX = first argument in Win64
   590  	CALL	AX
   591  
   592  	// Restore registers, g, stack pointer.
   593  	get_tls(CX)
   594  	MOVQ	48(SP), DI
   595  	MOVQ	(g_stack+stack_hi)(DI), SI
   596  	SUBQ	40(SP), SI
   597  	MOVQ	DI, g(CX)
   598  	MOVQ	SI, SP
   599  
   600  	MOVL	AX, ret+16(FP)
   601  	RET
   602  
   603  nosave:
   604  	// Running on a system stack, perhaps even without a g.
   605  	// Having no g can happen during thread creation or thread teardown
   606  	// (see needm/dropm on Solaris, for example).
   607  	// This code is like the above sequence but without saving/restoring g
   608  	// and without worrying about the stack moving out from under us
   609  	// (because we're on a system stack, not a goroutine stack).
   610  	// The above code could be used directly if already on a system stack,
   611  	// but then the only path through this code would be a rare case on Solaris.
   612  	// Using this code for all "already on system stack" calls exercises it more,
   613  	// which should help keep it correct.
   614  	SUBQ	$64, SP
   615  	ANDQ	$~15, SP
   616  	MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
   617  	MOVQ	DX, 40(SP)	// save original stack pointer
   618  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   619  	MOVQ	BX, CX		// CX = first argument in Win64
   620  	CALL	AX
   621  	MOVQ	40(SP), SI	// restore original stack pointer
   622  	MOVQ	SI, SP
   623  	MOVL	AX, ret+16(FP)
   624  	RET
   625  
   626  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
   627  // Turn the fn into a Go func (by taking its address) and call
   628  // cgocallback_gofunc.
   629  TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
   630  	LEAQ	fn+0(FP), AX
   631  	MOVQ	AX, 0(SP)
   632  	MOVQ	frame+8(FP), AX
   633  	MOVQ	AX, 8(SP)
   634  	MOVQ	framesize+16(FP), AX
   635  	MOVQ	AX, 16(SP)
   636  	MOVQ	ctxt+24(FP), AX
   637  	MOVQ	AX, 24(SP)
   638  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   639  	CALL	AX
   640  	RET
   641  
   642  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
   643  // See cgocall.go for more details.
   644  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
   645  	NO_LOCAL_POINTERS
   646  
   647  	// If g is nil, Go did not create the current thread.
   648  	// Call needm to obtain one m for temporary use.
   649  	// In this case, we're running on the thread stack, so there's
   650  	// lots of space, but the linker doesn't know. Hide the call from
   651  	// the linker analysis by using an indirect call through AX.
   652  	get_tls(CX)
   653  #ifdef GOOS_windows
   654  	MOVL	$0, BX
   655  	CMPQ	CX, $0
   656  	JEQ	2(PC)
   657  #endif
   658  	MOVQ	g(CX), BX
   659  	CMPQ	BX, $0
   660  	JEQ	needm
   661  	MOVQ	g_m(BX), BX
   662  	MOVQ	BX, R8 // holds oldm until end of function
   663  	JMP	havem
   664  needm:
   665  	MOVQ	$0, 0(SP)
   666  	MOVQ	$runtime·needm(SB), AX
   667  	CALL	AX
   668  	MOVQ	0(SP), R8
   669  	get_tls(CX)
   670  	MOVQ	g(CX), BX
   671  	MOVQ	g_m(BX), BX
   672  	
   673  	// Set m->sched.sp = SP, so that if a panic happens
   674  	// during the function we are about to execute, it will
   675  	// have a valid SP to run on the g0 stack.
   676  	// The next few lines (after the havem label)
   677  	// will save this SP onto the stack and then write
   678  	// the same SP back to m->sched.sp. That seems redundant,
   679  	// but if an unrecovered panic happens, unwindm will
   680  	// restore the g->sched.sp from the stack location
   681  	// and then systemstack will try to use it. If we don't set it here,
   682  	// that restored SP will be uninitialized (typically 0) and
   683  	// will not be usable.
   684  	MOVQ	m_g0(BX), SI
   685  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   686  
   687  havem:
   688  	// Now there's a valid m, and we're running on its m->g0.
   689  	// Save current m->g0->sched.sp on stack and then set it to SP.
   690  	// Save current sp in m->g0->sched.sp in preparation for
   691  	// switch back to m->curg stack.
   692  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   693  	MOVQ	m_g0(BX), SI
   694  	MOVQ	(g_sched+gobuf_sp)(SI), AX
   695  	MOVQ	AX, 0(SP)
   696  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   697  
   698  	// Switch to m->curg stack and call runtime.cgocallbackg.
   699  	// Because we are taking over the execution of m->curg
   700  	// but *not* resuming what had been running, we need to
   701  	// save that information (m->curg->sched) so we can restore it.
   702  	// We can restore m->curg->sched.sp easily, because calling
   703  	// runtime.cgocallbackg leaves SP unchanged upon return.
   704  	// To save m->curg->sched.pc, we push it onto the stack.
   705  	// This has the added benefit that it looks to the traceback
   706  	// routine like cgocallbackg is going to return to that
   707  	// PC (because the frame we allocate below has the same
   708  	// size as cgocallback_gofunc's frame declared above)
   709  	// so that the traceback will seamlessly trace back into
   710  	// the earlier calls.
   711  	//
   712  	// In the new goroutine, 8(SP) holds the saved R8.
   713  	MOVQ	m_curg(BX), SI
   714  	MOVQ	SI, g(CX)
   715  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   716  	MOVQ	(g_sched+gobuf_pc)(SI), BX
   717  	MOVQ	BX, -8(DI)
   718  	// Compute the size of the frame, including return PC and, if
   719  	// GOEXPERIMENT=framepointer, the saved base pointer
   720  	MOVQ	ctxt+24(FP), BX
   721  	LEAQ	fv+0(FP), AX
   722  	SUBQ	SP, AX
   723  	SUBQ	AX, DI
   724  	MOVQ	DI, SP
   725  
   726  	MOVQ	R8, 8(SP)
   727  	MOVQ	BX, 0(SP)
   728  	CALL	runtime·cgocallbackg(SB)
   729  	MOVQ	8(SP), R8
   730  
   731  	// Compute the size of the frame again. FP and SP have
   732  	// completely different values here than they did above,
   733  	// but only their difference matters.
   734  	LEAQ	fv+0(FP), AX
   735  	SUBQ	SP, AX
   736  
   737  	// Restore g->sched (== m->curg->sched) from saved values.
   738  	get_tls(CX)
   739  	MOVQ	g(CX), SI
   740  	MOVQ	SP, DI
   741  	ADDQ	AX, DI
   742  	MOVQ	-8(DI), BX
   743  	MOVQ	BX, (g_sched+gobuf_pc)(SI)
   744  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   745  
   746  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   747  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   748  	// so we do not have to restore it.)
   749  	MOVQ	g(CX), BX
   750  	MOVQ	g_m(BX), BX
   751  	MOVQ	m_g0(BX), SI
   752  	MOVQ	SI, g(CX)
   753  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   754  	MOVQ	0(SP), AX
   755  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   756  	
   757  	// If the m on entry was nil, we called needm above to borrow an m
   758  	// for the duration of the call. Since the call is over, return it with dropm.
   759  	CMPQ	R8, $0
   760  	JNE 3(PC)
   761  	MOVQ	$runtime·dropm(SB), AX
   762  	CALL	AX
   763  
   764  	// Done!
   765  	RET
   766  
   767  // void setg(G*); set g. for use by needm.
   768  TEXT runtime·setg(SB), NOSPLIT, $0-8
   769  	MOVQ	gg+0(FP), BX
   770  #ifdef GOOS_windows
   771  	CMPQ	BX, $0
   772  	JNE	settls
   773  	MOVQ	$0, 0x28(GS)
   774  	RET
   775  settls:
   776  	MOVQ	g_m(BX), AX
   777  	LEAQ	m_tls(AX), AX
   778  	MOVQ	AX, 0x28(GS)
   779  #endif
   780  	get_tls(CX)
   781  	MOVQ	BX, g(CX)
   782  	RET
   783  
   784  // void setg_gcc(G*); set g called from gcc.
   785  TEXT setg_gcc<>(SB),NOSPLIT,$0
   786  	get_tls(AX)
   787  	MOVQ	DI, g(AX)
   788  	RET
   789  
   790  // check that SP is in range [g->stack.lo, g->stack.hi)
   791  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   792  	get_tls(CX)
   793  	MOVQ	g(CX), AX
   794  	CMPQ	(g_stack+stack_hi)(AX), SP
   795  	JHI	2(PC)
   796  	INT	$3
   797  	CMPQ	SP, (g_stack+stack_lo)(AX)
   798  	JHI	2(PC)
   799  	INT	$3
   800  	RET
   801  
   802  TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
   803  	MOVQ	argp+0(FP),AX		// addr of first arg
   804  	MOVQ	-8(AX),AX		// get calling pc
   805  	CMPQ	AX, runtime·stackBarrierPC(SB)
   806  	JNE	nobar
   807  	// Get original return PC.
   808  	CALL	runtime·nextBarrierPC(SB)
   809  	MOVQ	0(SP), AX
   810  nobar:
   811  	MOVQ	AX, ret+8(FP)
   812  	RET
   813  
   814  TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
   815  	MOVQ	argp+0(FP),AX		// addr of first arg
   816  	MOVQ	pc+8(FP), BX
   817  	MOVQ	-8(AX), CX
   818  	CMPQ	CX, runtime·stackBarrierPC(SB)
   819  	JEQ	setbar
   820  	MOVQ	BX, -8(AX)		// set calling pc
   821  	RET
   822  setbar:
   823  	// Set the stack barrier return PC.
   824  	MOVQ	BX, 0(SP)
   825  	CALL	runtime·setNextBarrierPC(SB)
   826  	RET
   827  
   828  TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
   829  	MOVQ	argp+0(FP), AX
   830  	MOVQ	AX, ret+8(FP)
   831  	RET
   832  
   833  // func cputicks() int64
   834  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   835  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   836  	JNE	mfence
   837  	LFENCE
   838  	JMP	done
   839  mfence:
   840  	MFENCE
   841  done:
   842  	RDTSC
   843  	SHLQ	$32, DX
   844  	ADDQ	DX, AX
   845  	MOVQ	AX, ret+0(FP)
   846  	RET
   847  
   848  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   849  // redirects to memhash(p, h, size) using the size
   850  // stored in the closure.
   851  TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
   852  	GO_ARGS
   853  	NO_LOCAL_POINTERS
   854  	MOVQ	p+0(FP), AX
   855  	MOVQ	h+8(FP), BX
   856  	MOVQ	8(DX), CX
   857  	MOVQ	AX, 0(SP)
   858  	MOVQ	BX, 8(SP)
   859  	MOVQ	CX, 16(SP)
   860  	CALL	runtime·memhash(SB)
   861  	MOVQ	24(SP), AX
   862  	MOVQ	AX, ret+16(FP)
   863  	RET
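
        // Illustrative Go equivalent (getclosureptr is hypothetical here; the
        // assembly reads the size directly from 8(DX), the closure register):
        //
        //	func memhash_varlen(p unsafe.Pointer, h uintptr) uintptr {
        //		size := *(*uintptr)(unsafe.Pointer(getclosureptr() + 8))
        //		return memhash(p, h, size)
        //	}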
   864  
   865  // hash function using AES hardware instructions
   866  TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   867  	MOVQ	p+0(FP), AX	// ptr to data
   868  	MOVQ	s+16(FP), CX	// size
   869  	LEAQ	ret+24(FP), DX
   870  	JMP	runtime·aeshashbody(SB)
   871  
   872  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
   873  	MOVQ	p+0(FP), AX	// ptr to string struct
   874  	MOVQ	8(AX), CX	// length of string
   875  	MOVQ	(AX), AX	// string data
   876  	LEAQ	ret+16(FP), DX
   877  	JMP	runtime·aeshashbody(SB)
   878  
   879  // AX: data
   880  // CX: length
   881  // DX: address to put return value
   882  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   883  	// Fill an SSE register with our seeds.
   884  	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
   885  	PINSRW	$4, CX, X0			// 16 bits of length
   886  	PSHUFHW $0, X0, X0			// repeat length 4 times total
   887  	MOVO	X0, X1				// save unscrambled seed
   888  	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
   889  	AESENC	X0, X0				// scramble seed
   890  
   891  	CMPQ	CX, $16
   892  	JB	aes0to15
   893  	JE	aes16
   894  	CMPQ	CX, $32
   895  	JBE	aes17to32
   896  	CMPQ	CX, $64
   897  	JBE	aes33to64
   898  	CMPQ	CX, $128
   899  	JBE	aes65to128
   900  	JMP	aes129plus
   901  
   902  aes0to15:
   903  	TESTQ	CX, CX
   904  	JE	aes0
   905  
   906  	ADDQ	$16, AX
   907  	TESTW	$0xff0, AX
   908  	JE	endofpage
   909  
   910  	// 16 bytes loaded at this address won't cross
   911  	// a page boundary, so we can load it directly.
   912  	MOVOU	-16(AX), X1
   913  	ADDQ	CX, CX
   914  	MOVQ	$masks<>(SB), AX
   915  	PAND	(AX)(CX*8), X1
   916  final1:
   917  	PXOR	X0, X1	// xor data with seed
   918  	AESENC	X1, X1	// scramble combo 3 times
   919  	AESENC	X1, X1
   920  	AESENC	X1, X1
   921  	MOVQ	X1, (DX)
   922  	RET
   923  
   924  endofpage:
   925  	// address ends in 1111xxxx. Might be up against
   926  	// a page boundary, so load ending at last byte.
   927  	// Then shift bytes down using pshufb.
   928  	MOVOU	-32(AX)(CX*1), X1
   929  	ADDQ	CX, CX
   930  	MOVQ	$shifts<>(SB), AX
   931  	PSHUFB	(AX)(CX*8), X1
   932  	JMP	final1
   933  
   934  aes0:
   935  	// Return scrambled input seed
   936  	AESENC	X0, X0
   937  	MOVQ	X0, (DX)
   938  	RET
   939  
   940  aes16:
   941  	MOVOU	(AX), X1
   942  	JMP	final1
   943  
   944  aes17to32:
   945  	// make second starting seed
   946  	PXOR	runtime·aeskeysched+16(SB), X1
   947  	AESENC	X1, X1
   948  	
   949  	// load data to be hashed
   950  	MOVOU	(AX), X2
   951  	MOVOU	-16(AX)(CX*1), X3
   952  
   953  	// xor with seed
   954  	PXOR	X0, X2
   955  	PXOR	X1, X3
   956  
   957  	// scramble 3 times
   958  	AESENC	X2, X2
   959  	AESENC	X3, X3
   960  	AESENC	X2, X2
   961  	AESENC	X3, X3
   962  	AESENC	X2, X2
   963  	AESENC	X3, X3
   964  
   965  	// combine results
   966  	PXOR	X3, X2
   967  	MOVQ	X2, (DX)
   968  	RET
   969  
   970  aes33to64:
   971  	// make 3 more starting seeds
   972  	MOVO	X1, X2
   973  	MOVO	X1, X3
   974  	PXOR	runtime·aeskeysched+16(SB), X1
   975  	PXOR	runtime·aeskeysched+32(SB), X2
   976  	PXOR	runtime·aeskeysched+48(SB), X3
   977  	AESENC	X1, X1
   978  	AESENC	X2, X2
   979  	AESENC	X3, X3
   980  	
   981  	MOVOU	(AX), X4
   982  	MOVOU	16(AX), X5
   983  	MOVOU	-32(AX)(CX*1), X6
   984  	MOVOU	-16(AX)(CX*1), X7
   985  
   986  	PXOR	X0, X4
   987  	PXOR	X1, X5
   988  	PXOR	X2, X6
   989  	PXOR	X3, X7
   990  	
   991  	AESENC	X4, X4
   992  	AESENC	X5, X5
   993  	AESENC	X6, X6
   994  	AESENC	X7, X7
   995  	
   996  	AESENC	X4, X4
   997  	AESENC	X5, X5
   998  	AESENC	X6, X6
   999  	AESENC	X7, X7
  1000  	
  1001  	AESENC	X4, X4
  1002  	AESENC	X5, X5
  1003  	AESENC	X6, X6
  1004  	AESENC	X7, X7
  1005  
  1006  	PXOR	X6, X4
  1007  	PXOR	X7, X5
  1008  	PXOR	X5, X4
  1009  	MOVQ	X4, (DX)
  1010  	RET
  1011  
  1012  aes65to128:
  1013  	// make 7 more starting seeds
  1014  	MOVO	X1, X2
  1015  	MOVO	X1, X3
  1016  	MOVO	X1, X4
  1017  	MOVO	X1, X5
  1018  	MOVO	X1, X6
  1019  	MOVO	X1, X7
  1020  	PXOR	runtime·aeskeysched+16(SB), X1
  1021  	PXOR	runtime·aeskeysched+32(SB), X2
  1022  	PXOR	runtime·aeskeysched+48(SB), X3
  1023  	PXOR	runtime·aeskeysched+64(SB), X4
  1024  	PXOR	runtime·aeskeysched+80(SB), X5
  1025  	PXOR	runtime·aeskeysched+96(SB), X6
  1026  	PXOR	runtime·aeskeysched+112(SB), X7
  1027  	AESENC	X1, X1
  1028  	AESENC	X2, X2
  1029  	AESENC	X3, X3
  1030  	AESENC	X4, X4
  1031  	AESENC	X5, X5
  1032  	AESENC	X6, X6
  1033  	AESENC	X7, X7
  1034  
  1035  	// load data
  1036  	MOVOU	(AX), X8
  1037  	MOVOU	16(AX), X9
  1038  	MOVOU	32(AX), X10
  1039  	MOVOU	48(AX), X11
  1040  	MOVOU	-64(AX)(CX*1), X12
  1041  	MOVOU	-48(AX)(CX*1), X13
  1042  	MOVOU	-32(AX)(CX*1), X14
  1043  	MOVOU	-16(AX)(CX*1), X15
  1044  
  1045  	// xor with seed
  1046  	PXOR	X0, X8
  1047  	PXOR	X1, X9
  1048  	PXOR	X2, X10
  1049  	PXOR	X3, X11
  1050  	PXOR	X4, X12
  1051  	PXOR	X5, X13
  1052  	PXOR	X6, X14
  1053  	PXOR	X7, X15
  1054  
  1055  	// scramble 3 times
  1056  	AESENC	X8, X8
  1057  	AESENC	X9, X9
  1058  	AESENC	X10, X10
  1059  	AESENC	X11, X11
  1060  	AESENC	X12, X12
  1061  	AESENC	X13, X13
  1062  	AESENC	X14, X14
  1063  	AESENC	X15, X15
  1064  
  1065  	AESENC	X8, X8
  1066  	AESENC	X9, X9
  1067  	AESENC	X10, X10
  1068  	AESENC	X11, X11
  1069  	AESENC	X12, X12
  1070  	AESENC	X13, X13
  1071  	AESENC	X14, X14
  1072  	AESENC	X15, X15
  1073  
  1074  	AESENC	X8, X8
  1075  	AESENC	X9, X9
  1076  	AESENC	X10, X10
  1077  	AESENC	X11, X11
  1078  	AESENC	X12, X12
  1079  	AESENC	X13, X13
  1080  	AESENC	X14, X14
  1081  	AESENC	X15, X15
  1082  
  1083  	// combine results
  1084  	PXOR	X12, X8
  1085  	PXOR	X13, X9
  1086  	PXOR	X14, X10
  1087  	PXOR	X15, X11
  1088  	PXOR	X10, X8
  1089  	PXOR	X11, X9
  1090  	PXOR	X9, X8
  1091  	MOVQ	X8, (DX)
  1092  	RET
  1093  
  1094  aes129plus:
  1095  	// make 7 more starting seeds
  1096  	MOVO	X1, X2
  1097  	MOVO	X1, X3
  1098  	MOVO	X1, X4
  1099  	MOVO	X1, X5
  1100  	MOVO	X1, X6
  1101  	MOVO	X1, X7
  1102  	PXOR	runtime·aeskeysched+16(SB), X1
  1103  	PXOR	runtime·aeskeysched+32(SB), X2
  1104  	PXOR	runtime·aeskeysched+48(SB), X3
  1105  	PXOR	runtime·aeskeysched+64(SB), X4
  1106  	PXOR	runtime·aeskeysched+80(SB), X5
  1107  	PXOR	runtime·aeskeysched+96(SB), X6
  1108  	PXOR	runtime·aeskeysched+112(SB), X7
  1109  	AESENC	X1, X1
  1110  	AESENC	X2, X2
  1111  	AESENC	X3, X3
  1112  	AESENC	X4, X4
  1113  	AESENC	X5, X5
  1114  	AESENC	X6, X6
  1115  	AESENC	X7, X7
  1116  	
  1117  	// start with last (possibly overlapping) block
  1118  	MOVOU	-128(AX)(CX*1), X8
  1119  	MOVOU	-112(AX)(CX*1), X9
  1120  	MOVOU	-96(AX)(CX*1), X10
  1121  	MOVOU	-80(AX)(CX*1), X11
  1122  	MOVOU	-64(AX)(CX*1), X12
  1123  	MOVOU	-48(AX)(CX*1), X13
  1124  	MOVOU	-32(AX)(CX*1), X14
  1125  	MOVOU	-16(AX)(CX*1), X15
  1126  
  1127  	// xor in seed
  1128  	PXOR	X0, X8
  1129  	PXOR	X1, X9
  1130  	PXOR	X2, X10
  1131  	PXOR	X3, X11
  1132  	PXOR	X4, X12
  1133  	PXOR	X5, X13
  1134  	PXOR	X6, X14
  1135  	PXOR	X7, X15
  1136  	
  1137  	// compute number of remaining 128-byte blocks
  1138  	DECQ	CX
  1139  	SHRQ	$7, CX
  1140  	
  1141  aesloop:
  1142  	// scramble state
  1143  	AESENC	X8, X8
  1144  	AESENC	X9, X9
  1145  	AESENC	X10, X10
  1146  	AESENC	X11, X11
  1147  	AESENC	X12, X12
  1148  	AESENC	X13, X13
  1149  	AESENC	X14, X14
  1150  	AESENC	X15, X15
  1151  
  1152  	// scramble state, xor in a block
  1153  	MOVOU	(AX), X0
  1154  	MOVOU	16(AX), X1
  1155  	MOVOU	32(AX), X2
  1156  	MOVOU	48(AX), X3
  1157  	AESENC	X0, X8
  1158  	AESENC	X1, X9
  1159  	AESENC	X2, X10
  1160  	AESENC	X3, X11
  1161  	MOVOU	64(AX), X4
  1162  	MOVOU	80(AX), X5
  1163  	MOVOU	96(AX), X6
  1164  	MOVOU	112(AX), X7
  1165  	AESENC	X4, X12
  1166  	AESENC	X5, X13
  1167  	AESENC	X6, X14
  1168  	AESENC	X7, X15
  1169  
  1170  	ADDQ	$128, AX
  1171  	DECQ	CX
  1172  	JNE	aesloop
  1173  
  1174  	// 3 more scrambles to finish
  1175  	AESENC	X8, X8
  1176  	AESENC	X9, X9
  1177  	AESENC	X10, X10
  1178  	AESENC	X11, X11
  1179  	AESENC	X12, X12
  1180  	AESENC	X13, X13
  1181  	AESENC	X14, X14
  1182  	AESENC	X15, X15
  1183  	AESENC	X8, X8
  1184  	AESENC	X9, X9
  1185  	AESENC	X10, X10
  1186  	AESENC	X11, X11
  1187  	AESENC	X12, X12
  1188  	AESENC	X13, X13
  1189  	AESENC	X14, X14
  1190  	AESENC	X15, X15
  1191  	AESENC	X8, X8
  1192  	AESENC	X9, X9
  1193  	AESENC	X10, X10
  1194  	AESENC	X11, X11
  1195  	AESENC	X12, X12
  1196  	AESENC	X13, X13
  1197  	AESENC	X14, X14
  1198  	AESENC	X15, X15
  1199  
  1200  	PXOR	X12, X8
  1201  	PXOR	X13, X9
  1202  	PXOR	X14, X10
  1203  	PXOR	X15, X11
  1204  	PXOR	X10, X8
  1205  	PXOR	X11, X9
  1206  	PXOR	X9, X8
  1207  	MOVQ	X8, (DX)
  1208  	RET
  1209  	
  1210  TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
  1211  	MOVQ	p+0(FP), AX	// ptr to data
  1212  	MOVQ	h+8(FP), X0	// seed
  1213  	PINSRD	$2, (AX), X0	// data
  1214  	AESENC	runtime·aeskeysched+0(SB), X0
  1215  	AESENC	runtime·aeskeysched+16(SB), X0
  1216  	AESENC	runtime·aeskeysched+32(SB), X0
  1217  	MOVQ	X0, ret+16(FP)
  1218  	RET
  1219  
  1220  TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
  1221  	MOVQ	p+0(FP), AX	// ptr to data
  1222  	MOVQ	h+8(FP), X0	// seed
  1223  	PINSRQ	$1, (AX), X0	// data
  1224  	AESENC	runtime·aeskeysched+0(SB), X0
  1225  	AESENC	runtime·aeskeysched+16(SB), X0
  1226  	AESENC	runtime·aeskeysched+32(SB), X0
  1227  	MOVQ	X0, ret+16(FP)
  1228  	RET
  1229  
  1230  // simple mask to get rid of data in the high part of the register.
  1231  DATA masks<>+0x00(SB)/8, $0x0000000000000000
  1232  DATA masks<>+0x08(SB)/8, $0x0000000000000000
  1233  DATA masks<>+0x10(SB)/8, $0x00000000000000ff
  1234  DATA masks<>+0x18(SB)/8, $0x0000000000000000
  1235  DATA masks<>+0x20(SB)/8, $0x000000000000ffff
  1236  DATA masks<>+0x28(SB)/8, $0x0000000000000000
  1237  DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
  1238  DATA masks<>+0x38(SB)/8, $0x0000000000000000
  1239  DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
  1240  DATA masks<>+0x48(SB)/8, $0x0000000000000000
  1241  DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
  1242  DATA masks<>+0x58(SB)/8, $0x0000000000000000
  1243  DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
  1244  DATA masks<>+0x68(SB)/8, $0x0000000000000000
  1245  DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
  1246  DATA masks<>+0x78(SB)/8, $0x0000000000000000
  1247  DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
  1248  DATA masks<>+0x88(SB)/8, $0x0000000000000000
  1249  DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
  1250  DATA masks<>+0x98(SB)/8, $0x00000000000000ff
  1251  DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
  1252  DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
  1253  DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
  1254  DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
  1255  DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
  1256  DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
  1257  DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
  1258  DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
  1259  DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
  1260  DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
  1261  DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
  1262  DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
  1263  GLOBL masks<>(SB),RODATA,$256
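
        // Worked example: hashing 3 bytes takes the aes0to15 path above, where CX is
        // doubled to 6 and (AX)(CX*8) selects masks<>+0x30, i.e. 0x0000000000ffffff
        // followed by zeros, so the PAND keeps only the low 3 bytes of the loaded block.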
  1264  
  1265  TEXT ·checkASM(SB),NOSPLIT,$0-1
  1266  	// check that masks<>(SB) and shifts<>(SB) are 16-byte aligned
  1267  	MOVQ	$masks<>(SB), AX
  1268  	MOVQ	$shifts<>(SB), BX
  1269  	ORQ	BX, AX
  1270  	TESTQ	$15, AX
  1271  	SETEQ	ret+0(FP)
  1272  	RET
  1273  
  1274  // these are arguments to pshufb. They move data down from
  1275  // the high bytes of the register to the low bytes of the register.
  1276  // index is how many bytes to move.
  1277  DATA shifts<>+0x00(SB)/8, $0x0000000000000000
  1278  DATA shifts<>+0x08(SB)/8, $0x0000000000000000
  1279  DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
  1280  DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
  1281  DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
  1282  DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
  1283  DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
  1284  DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
  1285  DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
  1286  DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
  1287  DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
  1288  DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
  1289  DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
  1290  DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
  1291  DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
  1292  DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
  1293  DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
  1294  DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
  1295  DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
  1296  DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
  1297  DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
  1298  DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
  1299  DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
  1300  DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
  1301  DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
  1302  DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
  1303  DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
  1304  DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
  1305  DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
  1306  DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
  1307  DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
  1308  DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
  1309  GLOBL shifts<>(SB),RODATA,$256
  1310  
  1311  // memequal(p, q unsafe.Pointer, size uintptr) bool
  1312  TEXT runtime·memequal(SB),NOSPLIT,$0-25
  1313  	MOVQ	a+0(FP), SI
  1314  	MOVQ	b+8(FP), DI
  1315  	CMPQ	SI, DI
  1316  	JEQ	eq
  1317  	MOVQ	size+16(FP), BX
  1318  	LEAQ	ret+24(FP), AX
  1319  	JMP	runtime·memeqbody(SB)
  1320  eq:
  1321  	MOVB	$1, ret+24(FP)
  1322  	RET
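
        // Reference semantics (illustrative Go; the assembly in memeqbody is an
        // optimized equivalent of this byte-by-byte comparison):
        //
        //	func memequal(a, b unsafe.Pointer, size uintptr) bool {
        //		for i := uintptr(0); i < size; i++ {
        //			if *(*byte)(unsafe.Pointer(uintptr(a) + i)) != *(*byte)(unsafe.Pointer(uintptr(b) + i)) {
        //				return false
        //			}
        //		}
        //		return true
        //	}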
  1323  
  1324  // memequal_varlen(a, b unsafe.Pointer) bool
  1325  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
  1326  	MOVQ	a+0(FP), SI
  1327  	MOVQ	b+8(FP), DI
  1328  	CMPQ	SI, DI
  1329  	JEQ	eq
  1330  	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
  1331  	LEAQ	ret+16(FP), AX
  1332  	JMP	runtime·memeqbody(SB)
  1333  eq:
  1334  	MOVB	$1, ret+16(FP)
  1335  	RET
  1336  
  1337  // eqstring tests whether two strings are equal.
  1338  // The compiler guarantees that strings passed
  1339  // to eqstring have equal length.
  1340  // See runtime_test.go:eqstring_generic for
  1341  // equivalent Go code.
  1342  TEXT runtime·eqstring(SB),NOSPLIT,$0-33
  1343  	MOVQ	s1_base+0(FP), SI
  1344  	MOVQ	s2_base+16(FP), DI
  1345  	CMPQ	SI, DI
  1346  	JEQ	eq
  1347  	MOVQ	s1_len+8(FP), BX
  1348  	LEAQ	ret+32(FP), AX
  1349  	JMP	runtime·memeqbody(SB)
  1350  eq:
  1351  	MOVB	$1, ret+32(FP)
  1352  	RET
  1353  
  1354  // a in SI
  1355  // b in DI
  1356  // count in BX
  1357  // address of result byte in AX
  1358  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1359  	CMPQ	BX, $8
  1360  	JB	small
  1361  	CMPQ	BX, $64
  1362  	JB	bigloop
  1363  	CMPB    runtime·support_avx2(SB), $1
  1364  	JE	hugeloop_avx2
  1365  	
  1366  	// 64 bytes at a time using xmm registers
  1367  hugeloop:
  1368  	CMPQ	BX, $64
  1369  	JB	bigloop
  1370  	MOVOU	(SI), X0
  1371  	MOVOU	(DI), X1
  1372  	MOVOU	16(SI), X2
  1373  	MOVOU	16(DI), X3
  1374  	MOVOU	32(SI), X4
  1375  	MOVOU	32(DI), X5
  1376  	MOVOU	48(SI), X6
  1377  	MOVOU	48(DI), X7
  1378  	PCMPEQB	X1, X0
  1379  	PCMPEQB	X3, X2
  1380  	PCMPEQB	X5, X4
  1381  	PCMPEQB	X7, X6
  1382  	PAND	X2, X0
  1383  	PAND	X6, X4
  1384  	PAND	X4, X0
  1385  	PMOVMSKB X0, DX
  1386  	ADDQ	$64, SI
  1387  	ADDQ	$64, DI
  1388  	SUBQ	$64, BX
  1389  	CMPL	DX, $0xffff
  1390  	JEQ	hugeloop
  1391  	MOVB	$0, (AX)
  1392  	RET
  1393  
  1394  	// 64 bytes at a time using ymm registers
  1395  hugeloop_avx2:
  1396  	CMPQ	BX, $64
  1397  	JB	bigloop_avx2
  1398  	VMOVDQU	(SI), Y0
  1399  	VMOVDQU	(DI), Y1
  1400  	VMOVDQU	32(SI), Y2
  1401  	VMOVDQU	32(DI), Y3
  1402  	VPCMPEQB	Y1, Y0, Y4
  1403  	VPCMPEQB	Y2, Y3, Y5
  1404  	VPAND	Y4, Y5, Y6
  1405  	VPMOVMSKB Y6, DX
  1406  	ADDQ	$64, SI
  1407  	ADDQ	$64, DI
  1408  	SUBQ	$64, BX
  1409  	CMPL	DX, $0xffffffff
  1410  	JEQ	hugeloop_avx2
  1411  	VZEROUPPER
  1412  	MOVB	$0, (AX)
  1413  	RET
  1414  
  1415  bigloop_avx2:
  1416  	VZEROUPPER
  1417  
  1418  	// 8 bytes at a time using 64-bit register
  1419  bigloop:
  1420  	CMPQ	BX, $8
  1421  	JBE	leftover
  1422  	MOVQ	(SI), CX
  1423  	MOVQ	(DI), DX
  1424  	ADDQ	$8, SI
  1425  	ADDQ	$8, DI
  1426  	SUBQ	$8, BX
  1427  	CMPQ	CX, DX
  1428  	JEQ	bigloop
  1429  	MOVB	$0, (AX)
  1430  	RET
  1431  
  1432  	// remaining 0-8 bytes
  1433  leftover:
  1434  	MOVQ	-8(SI)(BX*1), CX
  1435  	MOVQ	-8(DI)(BX*1), DX
  1436  	CMPQ	CX, DX
  1437  	SETEQ	(AX)
  1438  	RET
  1439  
  1440  small:
  1441  	CMPQ	BX, $0
  1442  	JEQ	equal
  1443  
  1444  	LEAQ	0(BX*8), CX
  1445  	NEGQ	CX
  1446  
  1447  	CMPB	SI, $0xf8
  1448  	JA	si_high
  1449  
  1450  	// load at SI won't cross a page boundary.
  1451  	MOVQ	(SI), SI
  1452  	JMP	si_finish
  1453  si_high:
  1454  	// address ends in 11111xxx. Load ending at the last byte we want, then shift into position.
  1455  	MOVQ	-8(SI)(BX*1), SI
  1456  	SHRQ	CX, SI
  1457  si_finish:
  1458  
  1459  	// same for DI.
  1460  	CMPB	DI, $0xf8
  1461  	JA	di_high
  1462  	MOVQ	(DI), DI
  1463  	JMP	di_finish
  1464  di_high:
  1465  	MOVQ	-8(DI)(BX*1), DI
  1466  	SHRQ	CX, DI
  1467  di_finish:
  1468  
  1469  	SUBQ	SI, DI
  1470  	SHLQ	CX, DI
  1471  equal:
  1472  	SETEQ	(AX)
  1473  	RET
  1474  
  1475  TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
  1476  	MOVQ	s1_base+0(FP), SI
  1477  	MOVQ	s1_len+8(FP), BX
  1478  	MOVQ	s2_base+16(FP), DI
  1479  	MOVQ	s2_len+24(FP), DX
  1480  	LEAQ	ret+32(FP), R9
  1481  	JMP	runtime·cmpbody(SB)
  1482  
  1483  TEXT bytes·Compare(SB),NOSPLIT,$0-56
  1484  	MOVQ	s1+0(FP), SI
  1485  	MOVQ	s1+8(FP), BX
  1486  	MOVQ	s2+24(FP), DI
  1487  	MOVQ	s2+32(FP), DX
  1488  	LEAQ	res+48(FP), R9
  1489  	JMP	runtime·cmpbody(SB)
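
        // Reference semantics of cmpbody (illustrative Go):
        //
        //	func cmp(a, b []byte) int {
        //		n := len(a)
        //		if len(b) < n {
        //			n = len(b)
        //		}
        //		for i := 0; i < n; i++ {
        //			switch {
        //			case a[i] < b[i]:
        //				return -1
        //			case a[i] > b[i]:
        //				return +1
        //			}
        //		}
        //		switch {
        //		case len(a) < len(b):
        //			return -1
        //		case len(a) > len(b):
        //			return +1
        //		}
        //		return 0
        //	}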
  1490  
  1491  // input:
  1492  //   SI = a
  1493  //   DI = b
  1494  //   BX = alen
  1495  //   DX = blen
  1496  //   R9 = address of output word (stores -1/0/1 here)
  1497  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1498  	CMPQ	SI, DI
  1499  	JEQ	allsame
  1500  	CMPQ	BX, DX
  1501  	MOVQ	DX, R8
  1502  	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
  1503  	CMPQ	R8, $8
  1504  	JB	small
  1505  
  1506  	CMPQ	R8, $63
  1507  	JBE	loop
  1508  	CMPB    runtime·support_avx2(SB), $1
  1509  	JEQ     big_loop_avx2
  1510  	JMP	big_loop
  1511  loop:
  1512  	CMPQ	R8, $16
  1513  	JBE	_0through16
  1514  	MOVOU	(SI), X0
  1515  	MOVOU	(DI), X1
  1516  	PCMPEQB X0, X1
  1517  	PMOVMSKB X1, AX
  1518  	XORQ	$0xffff, AX	// convert EQ to NE
  1519  	JNE	diff16	// branch if at least one byte is not equal
  1520  	ADDQ	$16, SI
  1521  	ADDQ	$16, DI
  1522  	SUBQ	$16, R8
  1523  	JMP	loop
  1524  	
  1525  diff64:
  1526  	ADDQ	$48, SI
  1527  	ADDQ	$48, DI
  1528  	JMP	diff16
  1529  diff48:
  1530  	ADDQ	$32, SI
  1531  	ADDQ	$32, DI
  1532  	JMP	diff16
  1533  diff32:
  1534  	ADDQ	$16, SI
  1535  	ADDQ	$16, DI
  1536  	// AX = bit mask of differences
  1537  diff16:
  1538  	BSFQ	AX, BX	// index of first byte that differs
  1539  	XORQ	AX, AX
  1540  	MOVB	(SI)(BX*1), CX
  1541  	CMPB	CX, (DI)(BX*1)
  1542  	SETHI	AX
  1543  	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
  1544  	MOVQ	AX, (R9)
  1545  	RET
  1546  
  1547  	// 0 through 16 bytes left, alen>=8, blen>=8
  1548  _0through16:
  1549  	CMPQ	R8, $8
  1550  	JBE	_0through8
  1551  	MOVQ	(SI), AX
  1552  	MOVQ	(DI), CX
  1553  	CMPQ	AX, CX
  1554  	JNE	diff8
  1555  _0through8:
  1556  	MOVQ	-8(SI)(R8*1), AX
  1557  	MOVQ	-8(DI)(R8*1), CX
  1558  	CMPQ	AX, CX
  1559  	JEQ	allsame
  1560  
  1561  	// AX and CX contain parts of a and b that differ.
  1562  diff8:
  1563  	BSWAPQ	AX	// reverse order of bytes
  1564  	BSWAPQ	CX
  1565  	XORQ	AX, CX
  1566  	BSRQ	CX, CX	// index of highest bit difference
  1567  	SHRQ	CX, AX	// move a's bit to bottom
  1568  	ANDQ	$1, AX	// mask bit
  1569  	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
  1570  	MOVQ	AX, (R9)
  1571  	RET
  1572  
  1573  	// 0-7 bytes in common
  1574  small:
  1575  	LEAQ	(R8*8), CX	// bytes left -> bits left
  1576  	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
  1577  	JEQ	allsame
  1578  
  1579  	// load bytes of a into high bytes of SI
  1580  	CMPB	SI, $0xf8
  1581  	JA	si_high
  1582  	MOVQ	(SI), SI
  1583  	JMP	si_finish
  1584  si_high:
  1585  	MOVQ	-8(SI)(R8*1), SI
  1586  	SHRQ	CX, SI
  1587  si_finish:
  1588  	SHLQ	CX, SI
  1589  
  1590  	// load bytes of b into high bytes of DI
  1591  	CMPB	DI, $0xf8
  1592  	JA	di_high
  1593  	MOVQ	(DI), DI
  1594  	JMP	di_finish
  1595  di_high:
  1596  	MOVQ	-8(DI)(R8*1), DI
  1597  	SHRQ	CX, DI
  1598  di_finish:
  1599  	SHLQ	CX, DI
  1600  
  1601  	BSWAPQ	SI	// reverse order of bytes
  1602  	BSWAPQ	DI
  1603  	XORQ	SI, DI	// find bit differences
  1604  	JEQ	allsame
  1605  	BSRQ	DI, CX	// index of highest bit difference
  1606  	SHRQ	CX, SI	// move a's bit to bottom
  1607  	ANDQ	$1, SI	// mask bit
  1608  	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
  1609  	MOVQ	AX, (R9)
  1610  	RET
  1611  
  1612  allsame:
  1613  	XORQ	AX, AX
  1614  	XORQ	CX, CX
  1615  	CMPQ	BX, DX
  1616  	SETGT	AX	// 1 if alen > blen
  1617  	SETEQ	CX	// 1 if alen == blen
  1618  	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
  1619  	MOVQ	AX, (R9)
  1620  	RET
  1621  
  1622  	// this works for >= 64 bytes of data.
  1623  big_loop:
  1624  	MOVOU	(SI), X0
  1625  	MOVOU	(DI), X1
  1626  	PCMPEQB X0, X1
  1627  	PMOVMSKB X1, AX
  1628  	XORQ	$0xffff, AX
  1629  	JNE	diff16
  1630  
  1631  	MOVOU	16(SI), X0
  1632  	MOVOU	16(DI), X1
  1633  	PCMPEQB X0, X1
  1634  	PMOVMSKB X1, AX
  1635  	XORQ	$0xffff, AX
  1636  	JNE	diff32
  1637  
  1638  	MOVOU	32(SI), X0
  1639  	MOVOU	32(DI), X1
  1640  	PCMPEQB X0, X1
  1641  	PMOVMSKB X1, AX
  1642  	XORQ	$0xffff, AX
  1643  	JNE	diff48
  1644  
  1645  	MOVOU	48(SI), X0
  1646  	MOVOU	48(DI), X1
  1647  	PCMPEQB X0, X1
  1648  	PMOVMSKB X1, AX
  1649  	XORQ	$0xffff, AX
  1650  	JNE	diff64
  1651  
  1652  	ADDQ	$64, SI
  1653  	ADDQ	$64, DI
  1654  	SUBQ	$64, R8
  1655  	CMPQ	R8, $64
  1656  	JBE	loop
  1657  	JMP	big_loop
  1658  
  1659  	// Compare 64 bytes per loop iteration.
  1660  	// Loop is unrolled and uses AVX2.
  1661  big_loop_avx2:
  1662  	VMOVDQU	(SI), Y2
  1663  	VMOVDQU	(DI), Y3
  1664  	VMOVDQU	32(SI), Y4
  1665  	VMOVDQU	32(DI), Y5
  1666  	VPCMPEQB Y2, Y3, Y0
  1667  	VPMOVMSKB Y0, AX
  1668  	XORL	$0xffffffff, AX
  1669  	JNE	diff32_avx2
  1670  	VPCMPEQB Y4, Y5, Y6
  1671  	VPMOVMSKB Y6, AX
  1672  	XORL	$0xffffffff, AX
  1673  	JNE	diff64_avx2
  1674  
  1675  	ADDQ	$64, SI
  1676  	ADDQ	$64, DI
  1677  	SUBQ	$64, R8
  1678  	CMPQ	R8, $64
  1679  	JB	big_loop_avx2_exit
  1680  	JMP	big_loop_avx2
  1681  
  1682  	// Avoid AVX->SSE transition penalty and search first 32 bytes of the 64-byte chunk.
  1683  diff32_avx2:
  1684  	VZEROUPPER
  1685  	JMP diff16
  1686  
  1687  	// Same as diff32_avx2, but for last 32 bytes.
  1688  diff64_avx2:
  1689  	VZEROUPPER
  1690  	JMP diff48
  1691  
  1692  	// For a remainder of less than 64 bytes, jump to the normal loop.
  1693  big_loop_avx2_exit:
  1694  	VZEROUPPER
  1695  	JMP loop
  1696  
  1697  
  1698  // TODO: Also use this in bytes.Index
  1699  TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
  1700  	MOVQ s+0(FP), DI
  1701  	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
  1702  	MOVQ s_len+8(FP), DX
  1703  	MOVQ c+16(FP), BP
  1704  	MOVQ c_len+24(FP), AX
  1705  	CMPQ AX, DX
  1706  	JA fail
  1707  	CMPQ DX, $16
  1708  	JAE sse42
  1709  no_sse42:
  1710  	CMPQ AX, $2
  1711  	JA   _3_or_more
  1712  	MOVW (BP), BP
  1713  	LEAQ -1(DI)(DX*1), DX
  1714  loop2:
  1715  	MOVW (DI), SI
  1716  	CMPW SI,BP
  1717  	JZ success
  1718  	ADDQ $1,DI
  1719  	CMPQ DI,DX
  1720  	JB loop2
  1721  	JMP fail
  1722  _3_or_more:
  1723  	CMPQ AX, $3
  1724  	JA   _4_or_more
  1725  	MOVW 1(BP), BX
  1726  	MOVW (BP), BP
  1727  	LEAQ -2(DI)(DX*1), DX
  1728  loop3:
  1729  	MOVW (DI), SI
  1730  	CMPW SI,BP
  1731  	JZ   partial_success3
  1732  	ADDQ $1,DI
  1733  	CMPQ DI,DX
  1734  	JB loop3
  1735  	JMP fail
  1736  partial_success3:
  1737  	MOVW 1(DI), SI
  1738  	CMPW SI,BX
  1739  	JZ success
  1740  	ADDQ $1,DI
  1741  	CMPQ DI,DX
  1742  	JB loop3
  1743  	JMP fail
  1744  _4_or_more:
  1745  	CMPQ AX, $4
  1746  	JA   _5_or_more
  1747  	MOVL (BP), BP
  1748  	LEAQ -3(DI)(DX*1), DX
  1749  loop4:
  1750  	MOVL (DI), SI
  1751  	CMPL SI,BP
  1752  	JZ   success
  1753  	ADDQ $1,DI
  1754  	CMPQ DI,DX
  1755  	JB loop4
  1756  	JMP fail
  1757  _5_or_more:
  1758  	CMPQ AX, $7
  1759  	JA   _8_or_more
  1760  	LEAQ 1(DI)(DX*1), DX
  1761  	SUBQ AX, DX
  1762  	MOVL -4(BP)(AX*1), BX
  1763  	MOVL (BP), BP
  1764  loop5to7:
  1765  	MOVL (DI), SI
  1766  	CMPL SI,BP
  1767  	JZ   partial_success5to7
  1768  	ADDQ $1,DI
  1769  	CMPQ DI,DX
  1770  	JB loop5to7
  1771  	JMP fail
  1772  partial_success5to7:
  1773  	MOVL -4(AX)(DI*1), SI
  1774  	CMPL SI,BX
  1775  	JZ success
  1776  	ADDQ $1,DI
  1777  	CMPQ DI,DX
  1778  	JB loop5to7
  1779  	JMP fail
  1780  _8_or_more:
  1781  	CMPQ AX, $8
  1782  	JA   _9_or_more
  1783  	MOVQ (BP), BP
  1784  	LEAQ -7(DI)(DX*1), DX
  1785  loop8:
  1786  	MOVQ (DI), SI
  1787  	CMPQ SI,BP
  1788  	JZ   success
  1789  	ADDQ $1,DI
  1790  	CMPQ DI,DX
  1791  	JB loop8
  1792  	JMP fail
  1793  _9_or_more:
  1794  	CMPQ AX, $16
  1795  	JA   _16_or_more
  1796  	LEAQ 1(DI)(DX*1), DX
  1797  	SUBQ AX, DX
  1798  	MOVQ -8(BP)(AX*1), BX
  1799  	MOVQ (BP), BP
  1800  loop9to15:
  1801  	MOVQ (DI), SI
  1802  	CMPQ SI,BP
  1803  	JZ   partial_success9to15
  1804  	ADDQ $1,DI
  1805  	CMPQ DI,DX
  1806  	JB loop9to15
  1807  	JMP fail
  1808  partial_success9to15:
  1809  	MOVQ -8(AX)(DI*1), SI
  1810  	CMPQ SI,BX
  1811  	JZ success
  1812  	ADDQ $1,DI
  1813  	CMPQ DI,DX
  1814  	JB loop9to15
  1815  	JMP fail
  1816  _16_or_more:
  1817  	CMPQ AX, $16
  1818  	JA   _17_to_31
  1819  	MOVOU (BP), X1
  1820  	LEAQ -15(DI)(DX*1), DX
  1821  loop16:
  1822  	MOVOU (DI), X2
  1823  	PCMPEQB X1, X2
  1824  	PMOVMSKB X2, SI
  1825  	CMPQ  SI, $0xffff
  1826  	JE   success
  1827  	ADDQ $1,DI
  1828  	CMPQ DI,DX
  1829  	JB loop16
  1830  	JMP fail
  1831  _17_to_31:
  1832  	LEAQ 1(DI)(DX*1), DX
  1833  	SUBQ AX, DX
  1834  	MOVOU -16(BP)(AX*1), X0
  1835  	MOVOU (BP), X1
  1836  loop17to31:
  1837  	MOVOU (DI), X2
  1838  	PCMPEQB X1,X2
  1839  	PMOVMSKB X2, SI
  1840  	CMPQ  SI, $0xffff
  1841  	JE   partial_success17to31
  1842  	ADDQ $1,DI
  1843  	CMPQ DI,DX
  1844  	JB loop17to31
  1845  	JMP fail
  1846  partial_success17to31:
  1847  	MOVOU -16(AX)(DI*1), X3
  1848  	PCMPEQB X0, X3
  1849  	PMOVMSKB X3, SI
  1850  	CMPQ  SI, $0xffff
  1851  	JE success
  1852  	ADDQ $1,DI
  1853  	CMPQ DI,DX
  1854  	JB loop17to31
  1855  fail:
  1856  	MOVQ $-1, ret+32(FP)
  1857  	RET
  1858  sse42:
  1859  	MOVL runtime·cpuid_ecx(SB), CX
  1860  	ANDL $0x100000, CX
  1861  	JZ no_sse42
  1862  	CMPQ AX, $12
  1863  	// PCMPESTRI is slower than a plain compare,
  1864  	// so using it only pays off if we advance 4+ bytes per comparison.
  1865  	// This cutoff was determined experimentally and is roughly the same
  1866  	// on Nehalem (the first CPU with SSE4.2) and Haswell.
  1867  	JAE _9_or_more
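        	// The MOVOU below reads a full 16 bytes of sep even though
        	// len(sep) < 12 here. If BP+16 lands in the first 16 bytes of a 4KB
        	// page, sep starts in the last 16 bytes of the previous page and the
        	// wide load could touch the next, possibly unmapped, page, so bail
        	// out to no_sse42 instead.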
  1868  	LEAQ 16(BP), SI
  1869  	TESTW $0xff0, SI
  1870  	JEQ no_sse42
  1871  	MOVOU (BP), X1
  1872  	LEAQ -15(DI)(DX*1), SI
  1873  	MOVQ $16, R9
  1874  	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
  1875  loop_sse42:
  1876  	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
  1877  	// in equal-ordered (substring search) mode (bits 2,3 are 11)
  1878  	// result is not masked or inverted (bits 4,5 are 00)
  1879  	// and corresponds to first matching byte (bit 6 is 0)
  1880  	PCMPESTRI $0x0c, (DI), X1
  1881  	// CX == 16 means no match,
  1882  	// CX > R9 means only a partial match at the end of the 16-byte chunk,
  1883  	// otherwise sep starts at offset CX within the chunk at (DI)
  1884  	CMPQ CX, R9
  1885  	JBE sse42_success
  1886  	ADDQ R9, DI
  1887  	CMPQ DI, SI
  1888  	JB loop_sse42
  1889  	PCMPESTRI $0x0c, -1(SI), X1
  1890  	CMPQ CX, R9
  1891  	JA fail
  1892  	LEAQ -1(SI), DI
  1893  sse42_success:
  1894  	ADDQ CX, DI
  1895  success:
  1896  	SUBQ s+0(FP), DI
  1897  	MOVQ DI, ret+32(FP)
  1898  	RET
  1899  
  1900  
  1901  TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
  1902  	MOVQ s+0(FP), SI
  1903  	MOVQ s_len+8(FP), BX
  1904  	MOVB c+24(FP), AL
  1905  	LEAQ ret+32(FP), R8
  1906  	JMP  runtime·indexbytebody(SB)
  1907  
  1908  TEXT strings·IndexByte(SB),NOSPLIT,$0-32
  1909  	MOVQ s+0(FP), SI
  1910  	MOVQ s_len+8(FP), BX
  1911  	MOVB c+16(FP), AL
  1912  	LEAQ ret+24(FP), R8
  1913  	JMP  runtime·indexbytebody(SB)
  1914  
  1915  // input:
  1916  //   SI: data
  1917  //   BX: data len
  1918  //   AL: byte sought
  1919  //   R8: address to put result
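        //
        // Semantically this is a plain linear scan. A rough Go-level sketch
        // (illustration only; the name indexbyte is made up, and the real code
        // stores the result through R8 rather than returning it):
        //
        //	func indexbyte(s []byte, c byte) int {
        //		for i, b := range s {
        //			if b == c {
        //				return i
        //			}
        //		}
        //		return -1
        //	}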
  1920  TEXT runtime·indexbytebody(SB),NOSPLIT,$0
  1921  	// Shuffle X0 around so that each byte contains
  1922  	// the character we're looking for.
  1923  	MOVD AX, X0
  1924  	PUNPCKLBW X0, X0
  1925  	PUNPCKLBW X0, X0
  1926  	PSHUFL $0, X0, X0
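        	// e.g. AL = 'a' (0x61) leaves X0 = 0x6161...61 (sixteen copies of AL).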
  1927  	
  1928  	CMPQ BX, $16
  1929  	JLT small
  1930  
  1931  	MOVQ SI, DI
  1932  
  1933  	CMPQ BX, $32
  1934  	JA avx2
  1935  sse:
  1936  	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
  1937  	JMP	sseloopentry
  1938  	
  1939  sseloop:
  1940  	// Move the next 16-byte chunk of the data into X1.
  1941  	MOVOU	(DI), X1
  1942  	// Compare bytes in X0 to X1.
  1943  	PCMPEQB	X0, X1
  1944  	// Take the top bit of each byte in X1 and put the result in DX.
  1945  	PMOVMSKB X1, DX
  1946  	// Find first set bit, if any.
  1947  	BSFL	DX, DX
  1948  	JNZ	ssesuccess
  1949  	// Advance to next block.
  1950  	ADDQ	$16, DI
  1951  sseloopentry:
  1952  	CMPQ	DI, AX
  1953  	JB	sseloop
  1954  
  1955  	// Search the last 16-byte chunk. This chunk may overlap with the
  1956  	// chunks we've already searched, but that's ok.
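        	// For example, with BX = 37 the loop above checks the chunks at
        	// offsets 0 and 16, and this final load at offset 21 covers bytes
        	// 21-36, re-checking bytes 21-31.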
  1957  	MOVQ	AX, DI
  1958  	MOVOU	(AX), X1
  1959  	PCMPEQB	X0, X1
  1960  	PMOVMSKB X1, DX
  1961  	BSFL	DX, DX
  1962  	JNZ	ssesuccess
  1963  
  1964  failure:
  1965  	MOVQ $-1, (R8)
  1966  	RET
  1967  
  1968  // We've found a chunk containing the byte.
  1969  // The chunk was loaded from DI.
  1970  // The index of the matching byte in the chunk is DX.
  1971  // The start of the data is SI.
  1972  ssesuccess:
  1973  	SUBQ SI, DI	// Compute offset of chunk within data.
  1974  	ADDQ DX, DI	// Add offset of byte within chunk.
  1975  	MOVQ DI, (R8)
  1976  	RET
  1977  
  1978  // handle lengths < 16
  1979  small:
  1980  	TESTQ	BX, BX
  1981  	JEQ	failure
  1982  
  1983  	// Check if we'll load across a page boundary.
  1984  	LEAQ	16(SI), AX
  1985  	TESTW	$0xff0, AX
  1986  	JEQ	endofpage
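        	// (If SI+16 falls in the first 16 bytes of a 4KB page, SI itself is
        	// in the last 16 bytes of the previous page, so an unconditional
        	// 16-byte load from SI could read past the end of the data onto the
        	// next, possibly unmapped, page.)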
  1987  
  1988  	MOVOU	(SI), X1 // Load data
  1989  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  1990  	PMOVMSKB X1, DX	// Move result bits to integer register.
  1991  	BSFL	DX, DX	// Find first set bit.
  1992  	JZ	failure	// No set bit, failure.
  1993  	CMPL	DX, BX
  1994  	JAE	failure	// Match is past end of data.
  1995  	MOVQ	DX, (R8)
  1996  	RET
  1997  
  1998  endofpage:
  1999  	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
  2000  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  2001  	PMOVMSKB X1, DX	// Move result bits to integer register.
  2002  	MOVL	BX, CX	// CL = data length, the shift count below.
  2003  	SHLL	CX, DX	// Move the bit for data byte i up to bit 16+i; bits for the bytes before the data stay below bit 16 and are discarded next.
  2004  	SHRL	$16, DX	// Shift desired bits down to bottom of register.
  2005  	BSFL	DX, DX	// Find first set bit.
  2006  	JZ	failure	// No set bit, failure.
  2007  	MOVQ	DX, (R8)
  2008  	RET
  2009  
  2010  avx2:
  2011  	CMPB   runtime·support_avx2(SB), $1
  2012  	JNE sse
  2013  	MOVD AX, X0
  2014  	LEAQ -32(SI)(BX*1), R11
  2015  	VPBROADCASTB  X0, Y1
  2016  avx2_loop:
  2017  	VMOVDQU (DI), Y2
  2018  	VPCMPEQB Y1, Y2, Y3
  2019  	VPTEST Y3, Y3
  2020  	JNZ avx2success
  2021  	ADDQ $32, DI
  2022  	CMPQ DI, R11
  2023  	JLT avx2_loop
  2024  	MOVQ R11, DI
  2025  	VMOVDQU (DI), Y2
  2026  	VPCMPEQB Y1, Y2, Y3
  2027  	VPTEST Y3, Y3
  2028  	JNZ avx2success
  2029  	VZEROUPPER
  2030  	MOVQ $-1, (R8)
  2031  	RET
  2032  
  2033  avx2success:
  2034  	VPMOVMSKB Y3, DX
  2035  	BSFL DX, DX
  2036  	SUBQ SI, DI
  2037  	ADDQ DI, DX
  2038  	MOVQ DX, (R8)
  2039  	VZEROUPPER
  2040  	RET
  2041  
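        // bytes.Equal: unequal lengths can never be equal; otherwise hand the
        // two pointers (SI, DI), the length (BX) and the address of the result
        // byte (AX) to runtime·memeqbody, which is defined elsewhere in the
        // runtime.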
  2042  TEXT bytes·Equal(SB),NOSPLIT,$0-49
  2043  	MOVQ	a_len+8(FP), BX
  2044  	MOVQ	b_len+32(FP), CX
  2045  	CMPQ	BX, CX
  2046  	JNE	eqret
  2047  	MOVQ	a+0(FP), SI
  2048  	MOVQ	b+24(FP), DI
  2049  	LEAQ	ret+48(FP), AX
  2050  	JMP	runtime·memeqbody(SB)
  2051  eqret:
  2052  	MOVB	$0, ret+48(FP)
  2053  	RET
  2054  
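        // fastrand1 advances m.fastrand with a cheap shift-and-xor step and
        // returns the new value. A rough Go-level sketch (illustration only):
        //
        //	x := m.fastrand
        //	x += x // x <<= 1
        //	if int32(x) < 0 {
        //		x ^= 0x88888eef
        //	}
        //	m.fastrand = x
        //	return x
        //
        // The CMOVLMI below implements the branch without branching: since
        // 0x88888eef has its sign bit set, (x<<1)^0x88888eef is negative
        // exactly when x<<1 is non-negative, and in that case the un-XORed
        // value is kept.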
  2055  TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
  2056  	get_tls(CX)
  2057  	MOVQ	g(CX), AX
  2058  	MOVQ	g_m(AX), AX
  2059  	MOVL	m_fastrand(AX), DX
  2060  	ADDL	DX, DX
  2061  	MOVL	DX, BX
  2062  	XORL	$0x88888eef, DX
  2063  	CMOVLMI	BX, DX
  2064  	MOVL	DX, m_fastrand(AX)
  2065  	MOVL	DX, ret+0(FP)
  2066  	RET
  2067  
  2068  TEXT runtime·return0(SB), NOSPLIT, $0
  2069  	MOVL	$0, AX
  2070  	RET
  2071  
  2072  
  2073  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  2074  // Must obey the gcc calling convention.
  2075  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  2076  	get_tls(CX)
  2077  	MOVQ	g(CX), AX
  2078  	MOVQ	g_m(AX), AX
  2079  	MOVQ	m_curg(AX), AX
  2080  	MOVQ	(g_stack+stack_hi)(AX), AX
  2081  	RET
  2082  
  2083  // The top-most function running on a goroutine
  2084  // returns to goexit+PCQuantum.
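        // The leading NOP keeps goexit+PCQuantum inside goexit's code range:
        // a finished goroutine resumes at the CALL below, while traceback,
        // which inspects pc-1, still attributes that return PC to goexit.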
  2085  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  2086  	BYTE	$0x90	// NOP
  2087  	CALL	runtime·goexit1(SB)	// does not return
  2088  	// traceback from goexit1 must hit code range of goexit
  2089  	BYTE	$0x90	// NOP
  2090  
  2091  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
  2092  	MOVQ	addr+0(FP), AX
  2093  	PREFETCHT0	(AX)
  2094  	RET
  2095  
  2096  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
  2097  	MOVQ	addr+0(FP), AX
  2098  	PREFETCHT1	(AX)
  2099  	RET
  2100  
  2101  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
  2102  	MOVQ	addr+0(FP), AX
  2103  	PREFETCHT2	(AX)
  2104  	RET
  2105  
  2106  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
  2107  	MOVQ	addr+0(FP), AX
  2108  	PREFETCHNTA	(AX)
  2109  	RET
  2110  
  2111  // This is called from .init_array and follows the platform, not Go, ABI.
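        // The caller passes the new module's moduledata in DI, the first
        // integer-argument register of the System V AMD64 convention, which is
        // why it is stored directly below.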
  2112  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  2113  	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
  2114  	MOVQ	runtime·lastmoduledatap(SB), AX
  2115  	MOVQ	DI, moduledata_next(AX)
  2116  	MOVQ	DI, runtime·lastmoduledatap(SB)
  2117  	POPQ	R15
  2118  	RET