github.com/miolini/go@v0.0.0-20160405192216-fca68c8cb408/src/runtime/asm_amd64.s

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
    11  	// copy arguments forward on an even stack
    12  	MOVQ	DI, AX		// argc
    13  	MOVQ	SI, BX		// argv
    14  	SUBQ	$(4*8+7), SP		// 2args 2auto
    15  	ANDQ	$~15, SP
    16  	MOVQ	AX, 16(SP)
    17  	MOVQ	BX, 24(SP)
    18  	
    19  	// create istack out of the given (operating system) stack.
    20  	// _cgo_init may update stackguard.
    21  	MOVQ	$runtime·g0(SB), DI
    22  	LEAQ	(-64*1024+104)(SP), BX
    23  	MOVQ	BX, g_stackguard0(DI)
    24  	MOVQ	BX, g_stackguard1(DI)
    25  	MOVQ	BX, (g_stack+stack_lo)(DI)
    26  	MOVQ	SP, (g_stack+stack_hi)(DI)
    27  
    28  	// find out information about the processor we're on
    29  	MOVQ	$0, AX
    30  	CPUID
    31  	MOVQ	AX, SI
    32  	CMPQ	AX, $0
    33  	JE	nocpuinfo
    34  
    35  	// Figure out how to serialize RDTSC.
    36  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    37  	// Don't know about the rest, so let's do MFENCE.
    38  	CMPL	BX, $0x756E6547  // "Genu"
    39  	JNE	notintel
    40  	CMPL	DX, $0x49656E69  // "ineI"
    41  	JNE	notintel
    42  	CMPL	CX, $0x6C65746E  // "ntel"
    43  	JNE	notintel
    44  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
    45  notintel:
    46  
    47  	// Load EAX=1 cpuid flags
    48  	MOVQ	$1, AX
    49  	CPUID
    50  	MOVL	CX, runtime·cpuid_ecx(SB)
    51  	MOVL	DX, runtime·cpuid_edx(SB)
    52  
    53  	// Load EAX=7/ECX=0 cpuid flags
    54  	CMPQ	SI, $7
    55  	JLT	no7
    56  	MOVL	$7, AX
    57  	MOVL	$0, CX
    58  	CPUID
    59  	MOVL	BX, runtime·cpuid_ebx7(SB)
    60  no7:
    61  	// Detect AVX and AVX2 as described in section 14.7.1, "Detection of AVX2", of [1]
    62  	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
    63  	// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
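	// The check below, in Go-style pseudocode (an illustrative sketch only;
	// xgetbv is a hypothetical helper, not a runtime function):
	//	if cpuid_ecx&0x18000000 == 0x18000000 {        // OSXSAVE and AVX bits set
	//		if xgetbv(0)&6 == 6 {                  // OS saves XMM and YMM state
	//			support_avx = true
	//			support_avx2 = cpuid_ebx7&(1<<5) != 0
	//		}
	//	}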
    64  	MOVL	runtime·cpuid_ecx(SB), CX
    65  	ANDL    $0x18000000, CX // check for OSXSAVE and AVX bits
    66  	CMPL    CX, $0x18000000
    67  	JNE     noavx
    68  	MOVL    $0, CX
    69  	// For XGETBV, OSXSAVE bit is required and sufficient
    70  	XGETBV
    71  	ANDL    $6, AX
    72  	CMPL    AX, $6 // Check for OS support of YMM registers
    73  	JNE     noavx
    74  	MOVB    $1, runtime·support_avx(SB)
    75  	TESTL   $(1<<5), runtime·cpuid_ebx7(SB) // check for AVX2 bit
    76  	JEQ     noavx2
    77  	MOVB    $1, runtime·support_avx2(SB)
    78  	JMP     nocpuinfo
    79  noavx:
    80  	MOVB    $0, runtime·support_avx(SB)
    81  noavx2:
    82  	MOVB    $0, runtime·support_avx2(SB)
    83  nocpuinfo:	
    84  	
    85  	// if there is an _cgo_init, call it.
    86  	MOVQ	_cgo_init(SB), AX
    87  	TESTQ	AX, AX
    88  	JZ	needtls
    89  	// g0 already in DI
    90  	MOVQ	DI, CX	// Win64 uses CX for first parameter
    91  	MOVQ	$setg_gcc<>(SB), SI
    92  	CALL	AX
    93  
    94  	// update stackguard after _cgo_init
    95  	MOVQ	$runtime·g0(SB), CX
    96  	MOVQ	(g_stack+stack_lo)(CX), AX
    97  	ADDQ	$const__StackGuard, AX
    98  	MOVQ	AX, g_stackguard0(CX)
    99  	MOVQ	AX, g_stackguard1(CX)
   100  
   101  #ifndef GOOS_windows
   102  	JMP ok
   103  #endif
   104  needtls:
   105  #ifdef GOOS_plan9
   106  	// skip TLS setup on Plan 9
   107  	JMP ok
   108  #endif
   109  #ifdef GOOS_solaris
   110  	// skip TLS setup on Solaris
   111  	JMP ok
   112  #endif
   113  
   114  	LEAQ	runtime·m0+m_tls(SB), DI
   115  	CALL	runtime·settls(SB)
   116  
   117  	// store through it, to make sure it works
   118  	get_tls(BX)
   119  	MOVQ	$0x123, g(BX)
   120  	MOVQ	runtime·m0+m_tls(SB), AX
   121  	CMPQ	AX, $0x123
   122  	JEQ 2(PC)
   123  	MOVL	AX, 0	// abort
   124  ok:
   125  	// set the per-goroutine and per-mach "registers"
   126  	get_tls(BX)
   127  	LEAQ	runtime·g0(SB), CX
   128  	MOVQ	CX, g(BX)
   129  	LEAQ	runtime·m0(SB), AX
   130  
   131  	// save m->g0 = g0
   132  	MOVQ	CX, m_g0(AX)
   133  	// save m0 to g0->m
   134  	MOVQ	AX, g_m(CX)
   135  
   136  	CLD				// convention is D is always left cleared
   137  	CALL	runtime·check(SB)
   138  
   139  	MOVL	16(SP), AX		// copy argc
   140  	MOVL	AX, 0(SP)
   141  	MOVQ	24(SP), AX		// copy argv
   142  	MOVQ	AX, 8(SP)
   143  	CALL	runtime·args(SB)
   144  	CALL	runtime·osinit(SB)
   145  	CALL	runtime·schedinit(SB)
   146  
   147  	// create a new goroutine to start program
   148  	MOVQ	$runtime·mainPC(SB), AX		// entry
   149  	PUSHQ	AX
   150  	PUSHQ	$0			// arg size
   151  	CALL	runtime·newproc(SB)
   152  	POPQ	AX
   153  	POPQ	AX
   154  
   155  	// start this M
   156  	CALL	runtime·mstart(SB)
   157  
   158  	MOVL	$0xf1, 0xf1  // crash
   159  	RET
   160  
   161  DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
   162  GLOBL	runtime·mainPC(SB),RODATA,$8
   163  
   164  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
   165  	BYTE	$0xcc
   166  	RET
   167  
   168  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   169  	// No per-thread init.
   170  	RET
   171  
   172  /*
   173   *  go-routine
   174   */
   175  
   176  // void gosave(Gobuf*)
   177  // save state in Gobuf; setjmp
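// In Go terms this saves roughly the following (an illustrative sketch of the
// Gobuf fields written below, not additional runtime code):
//	buf.sp = caller's SP
//	buf.pc = caller's PC
//	buf.ret, buf.ctxt = 0, nil
//	buf.bp = BP
//	buf.g = getg()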
   178  TEXT runtime·gosave(SB), NOSPLIT, $0-8
   179  	MOVQ	buf+0(FP), AX		// gobuf
   180  	LEAQ	buf+0(FP), BX		// caller's SP
   181  	MOVQ	BX, gobuf_sp(AX)
   182  	MOVQ	0(SP), BX		// caller's PC
   183  	MOVQ	BX, gobuf_pc(AX)
   184  	MOVQ	$0, gobuf_ret(AX)
   185  	MOVQ	$0, gobuf_ctxt(AX)
   186  	MOVQ	BP, gobuf_bp(AX)
   187  	get_tls(CX)
   188  	MOVQ	g(CX), BX
   189  	MOVQ	BX, gobuf_g(AX)
   190  	RET
   191  
   192  // void gogo(Gobuf*)
   193  // restore state from Gobuf; longjmp
   194  TEXT runtime·gogo(SB), NOSPLIT, $0-8
   195  	MOVQ	buf+0(FP), BX		// gobuf
   196  	MOVQ	gobuf_g(BX), DX
   197  	MOVQ	0(DX), CX		// make sure g != nil
   198  	get_tls(CX)
   199  	MOVQ	DX, g(CX)
   200  	MOVQ	gobuf_sp(BX), SP	// restore SP
   201  	MOVQ	gobuf_ret(BX), AX
   202  	MOVQ	gobuf_ctxt(BX), DX
   203  	MOVQ	gobuf_bp(BX), BP
   204  	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   205  	MOVQ	$0, gobuf_ret(BX)
   206  	MOVQ	$0, gobuf_ctxt(BX)
   207  	MOVQ	$0, gobuf_bp(BX)
   208  	MOVQ	gobuf_pc(BX), BX
   209  	JMP	BX
   210  
   211  // func mcall(fn func(*g))
   212  // Switch to m->g0's stack, call fn(g).
   213  // Fn must never return. It should gogo(&g->sched)
   214  // to keep running g.
   215  TEXT runtime·mcall(SB), NOSPLIT, $0-8
   216  	MOVQ	fn+0(FP), DI
   217  	
   218  	get_tls(CX)
   219  	MOVQ	g(CX), AX	// save state in g->sched
   220  	MOVQ	0(SP), BX	// caller's PC
   221  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   222  	LEAQ	fn+0(FP), BX	// caller's SP
   223  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   224  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   225  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   226  
   227  	// switch to m->g0 & its stack, call fn
   228  	MOVQ	g(CX), BX
   229  	MOVQ	g_m(BX), BX
   230  	MOVQ	m_g0(BX), SI
   231  	CMPQ	SI, AX	// if g == m->g0 call badmcall
   232  	JNE	3(PC)
   233  	MOVQ	$runtime·badmcall(SB), AX
   234  	JMP	AX
   235  	MOVQ	SI, g(CX)	// g = m->g0
   236  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   237  	PUSHQ	AX
   238  	MOVQ	DI, DX
   239  	MOVQ	0(DI), DI
   240  	CALL	DI
   241  	POPQ	AX
   242  	MOVQ	$runtime·badmcall2(SB), AX
   243  	JMP	AX
   244  	RET
   245  
   246  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   247  // of the G stack. We need to distinguish the routine that
   248  // lives at the bottom of the G stack from the one that lives
   249  // at the top of the system stack because the one at the top of
   250  // the system stack terminates the stack walk (see topofstack()).
   251  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   252  	RET
   253  
   254  // func systemstack(fn func())
   255  TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   256  	MOVQ	fn+0(FP), DI	// DI = fn
   257  	get_tls(CX)
   258  	MOVQ	g(CX), AX	// AX = g
   259  	MOVQ	g_m(AX), BX	// BX = m
   260  
   261  	MOVQ	m_gsignal(BX), DX	// DX = gsignal
   262  	CMPQ	AX, DX
   263  	JEQ	noswitch
   264  
   265  	MOVQ	m_g0(BX), DX	// DX = g0
   266  	CMPQ	AX, DX
   267  	JEQ	noswitch
   268  
   269  	MOVQ	m_curg(BX), R8
   270  	CMPQ	AX, R8
   271  	JEQ	switch
   272  	
   273  	// Bad: g is not gsignal, not g0, not curg. What is it?
   274  	MOVQ	$runtime·badsystemstack(SB), AX
   275  	CALL	AX
   276  
   277  switch:
   278  	// save our state in g->sched. Pretend to
   279  	// be systemstack_switch if the G stack is scanned.
   280  	MOVQ	$runtime·systemstack_switch(SB), SI
   281  	MOVQ	SI, (g_sched+gobuf_pc)(AX)
   282  	MOVQ	SP, (g_sched+gobuf_sp)(AX)
   283  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   284  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   285  
   286  	// switch to g0
   287  	MOVQ	DX, g(CX)
   288  	MOVQ	(g_sched+gobuf_sp)(DX), BX
   289  	// make it look like mstart called systemstack on g0, to stop traceback
   290  	SUBQ	$8, BX
   291  	MOVQ	$runtime·mstart(SB), DX
   292  	MOVQ	DX, 0(BX)
   293  	MOVQ	BX, SP
   294  
   295  	// call target function
   296  	MOVQ	DI, DX
   297  	MOVQ	0(DI), DI
   298  	CALL	DI
   299  
   300  	// switch back to g
   301  	get_tls(CX)
   302  	MOVQ	g(CX), AX
   303  	MOVQ	g_m(AX), BX
   304  	MOVQ	m_curg(BX), AX
   305  	MOVQ	AX, g(CX)
   306  	MOVQ	(g_sched+gobuf_sp)(AX), SP
   307  	MOVQ	$0, (g_sched+gobuf_sp)(AX)
   308  	RET
   309  
   310  noswitch:
   311  	// already on m stack, just call directly
   312  	MOVQ	DI, DX
   313  	MOVQ	0(DI), DI
   314  	CALL	DI
   315  	RET
   316  
   317  /*
   318   * support for morestack
   319   */
   320  
   321  // Called during function prolog when more stack is needed.
   322  //
   323  // The traceback routines see morestack on a g0 as being
   324  // the top of a stack (for example, morestack calling newstack
   325  // calling the scheduler calling newm calling gc), so we must
   326  // record an argument size. For that purpose, it has no arguments.
   327  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   328  	// Cannot grow scheduler stack (m->g0).
   329  	get_tls(CX)
   330  	MOVQ	g(CX), BX
   331  	MOVQ	g_m(BX), BX
   332  	MOVQ	m_g0(BX), SI
   333  	CMPQ	g(CX), SI
   334  	JNE	2(PC)
   335  	INT	$3
   336  
   337  	// Cannot grow signal stack (m->gsignal).
   338  	MOVQ	m_gsignal(BX), SI
   339  	CMPQ	g(CX), SI
   340  	JNE	2(PC)
   341  	INT	$3
   342  
   343  	// Called from f.
   344  	// Set m->morebuf to f's caller.
   345  	MOVQ	8(SP), AX	// f's caller's PC
   346  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   347  	LEAQ	16(SP), AX	// f's caller's SP
   348  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   349  	get_tls(CX)
   350  	MOVQ	g(CX), SI
   351  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   352  
   353  	// Set g->sched to context in f.
   354  	MOVQ	0(SP), AX // f's PC
   355  	MOVQ	AX, (g_sched+gobuf_pc)(SI)
   356  	MOVQ	SI, (g_sched+gobuf_g)(SI)
   357  	LEAQ	8(SP), AX // f's SP
   358  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   359  	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
   360  	MOVQ	BP, (g_sched+gobuf_bp)(SI)
   361  
   362  	// Call newstack on m->g0's stack.
   363  	MOVQ	m_g0(BX), BX
   364  	MOVQ	BX, g(CX)
   365  	MOVQ	(g_sched+gobuf_sp)(BX), SP
   366  	CALL	runtime·newstack(SB)
   367  	MOVQ	$0, 0x1003	// crash if newstack returns
   368  	RET
   369  
   370  // morestack but not preserving ctxt.
   371  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   372  	MOVL	$0, DX
   373  	JMP	runtime·morestack(SB)
   374  
   375  TEXT runtime·stackBarrier(SB),NOSPLIT,$0
   376  	// We came here via a RET to an overwritten return PC.
   377  	// AX may be live. Other registers are available.
   378  
   379  	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   380  	get_tls(CX)
   381  	MOVQ	g(CX), CX
   382  	MOVQ	(g_stkbar+slice_array)(CX), DX
   383  	MOVQ	g_stkbarPos(CX), BX
   384  	IMULQ	$stkbar__size, BX	// Too big for SIB.
   385  	MOVQ	stkbar_savedLRPtr(DX)(BX*1), R8
   386  	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
   387  	// Assert that we're popping the right saved LR.
   388  	ADDQ	$8, R8
   389  	CMPQ	R8, SP
   390  	JEQ	2(PC)
   391  	MOVL	$0, 0
   392  	// Record that this stack barrier was hit.
   393  	ADDQ	$1, g_stkbarPos(CX)
   394  	// Jump to the original return PC.
   395  	JMP	BX
   396  
   397  // reflectcall: call a function with the given argument list
   398  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   399  // we don't have variable-sized frames, so we use a small number
   400  // of constant-sized-frame functions to encode a few bits of size in the pc.
   401  // Caution: ugly multiline assembly macros in your future!
   402  
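// The DISPATCH chain below jumps to the first call<N> whose MAXSIZE is at
// least argsize. For example, argsize = 200 falls through the 32/64/128
// checks and lands in runtime·call256 (a fixed 256-byte frame). In Go-style
// pseudocode (illustrative only):
//	switch {
//	case argsize <= 32:
//		call32(...)
//	case argsize <= 64:
//		call64(...)
//	// ... and so on, doubling up to 1<<30 ...
//	default:
//		badreflectcall()
//	}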
   403  #define DISPATCH(NAME,MAXSIZE)		\
   404  	CMPQ	CX, $MAXSIZE;		\
   405  	JA	3(PC);			\
   406  	MOVQ	$NAME(SB), AX;		\
   407  	JMP	AX
   408  // Note: can't just "JMP NAME(SB)" - bad inlining results.
   409  
   410  TEXT reflect·call(SB), NOSPLIT, $0-0
   411  	JMP	·reflectcall(SB)
   412  
   413  TEXT ·reflectcall(SB), NOSPLIT, $0-32
   414  	MOVLQZX argsize+24(FP), CX
   415  	// NOTE(rsc): No call16, because CALLFN needs four words
   416  	// of argument space to invoke callwritebarrier.
   417  	DISPATCH(runtime·call32, 32)
   418  	DISPATCH(runtime·call64, 64)
   419  	DISPATCH(runtime·call128, 128)
   420  	DISPATCH(runtime·call256, 256)
   421  	DISPATCH(runtime·call512, 512)
   422  	DISPATCH(runtime·call1024, 1024)
   423  	DISPATCH(runtime·call2048, 2048)
   424  	DISPATCH(runtime·call4096, 4096)
   425  	DISPATCH(runtime·call8192, 8192)
   426  	DISPATCH(runtime·call16384, 16384)
   427  	DISPATCH(runtime·call32768, 32768)
   428  	DISPATCH(runtime·call65536, 65536)
   429  	DISPATCH(runtime·call131072, 131072)
   430  	DISPATCH(runtime·call262144, 262144)
   431  	DISPATCH(runtime·call524288, 524288)
   432  	DISPATCH(runtime·call1048576, 1048576)
   433  	DISPATCH(runtime·call2097152, 2097152)
   434  	DISPATCH(runtime·call4194304, 4194304)
   435  	DISPATCH(runtime·call8388608, 8388608)
   436  	DISPATCH(runtime·call16777216, 16777216)
   437  	DISPATCH(runtime·call33554432, 33554432)
   438  	DISPATCH(runtime·call67108864, 67108864)
   439  	DISPATCH(runtime·call134217728, 134217728)
   440  	DISPATCH(runtime·call268435456, 268435456)
   441  	DISPATCH(runtime·call536870912, 536870912)
   442  	DISPATCH(runtime·call1073741824, 1073741824)
   443  	MOVQ	$runtime·badreflectcall(SB), AX
   444  	JMP	AX
   445  
   446  #define CALLFN(NAME,MAXSIZE)			\
   447  TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   448  	NO_LOCAL_POINTERS;			\
   449  	/* copy arguments to stack */		\
   450  	MOVQ	argptr+16(FP), SI;		\
   451  	MOVLQZX argsize+24(FP), CX;		\
   452  	MOVQ	SP, DI;				\
   453  	REP;MOVSB;				\
   454  	/* call function */			\
   455  	MOVQ	f+8(FP), DX;			\
   456  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   457  	CALL	(DX);				\
   458  	/* copy return values back */		\
   459  	MOVQ	argptr+16(FP), DI;		\
   460  	MOVLQZX	argsize+24(FP), CX;		\
   461  	MOVLQZX retoffset+28(FP), BX;		\
   462  	MOVQ	SP, SI;				\
   463  	ADDQ	BX, DI;				\
   464  	ADDQ	BX, SI;				\
   465  	SUBQ	BX, CX;				\
   466  	REP;MOVSB;				\
   467  	/* execute write barrier updates */	\
   468  	MOVQ	argtype+0(FP), DX;		\
   469  	MOVQ	argptr+16(FP), DI;		\
   470  	MOVLQZX	argsize+24(FP), CX;		\
   471  	MOVLQZX retoffset+28(FP), BX;		\
   472  	MOVQ	DX, 0(SP);			\
   473  	MOVQ	DI, 8(SP);			\
   474  	MOVQ	CX, 16(SP);			\
   475  	MOVQ	BX, 24(SP);			\
   476  	CALL	runtime·callwritebarrier(SB);	\
   477  	RET
   478  
   479  CALLFN(·call32, 32)
   480  CALLFN(·call64, 64)
   481  CALLFN(·call128, 128)
   482  CALLFN(·call256, 256)
   483  CALLFN(·call512, 512)
   484  CALLFN(·call1024, 1024)
   485  CALLFN(·call2048, 2048)
   486  CALLFN(·call4096, 4096)
   487  CALLFN(·call8192, 8192)
   488  CALLFN(·call16384, 16384)
   489  CALLFN(·call32768, 32768)
   490  CALLFN(·call65536, 65536)
   491  CALLFN(·call131072, 131072)
   492  CALLFN(·call262144, 262144)
   493  CALLFN(·call524288, 524288)
   494  CALLFN(·call1048576, 1048576)
   495  CALLFN(·call2097152, 2097152)
   496  CALLFN(·call4194304, 4194304)
   497  CALLFN(·call8388608, 8388608)
   498  CALLFN(·call16777216, 16777216)
   499  CALLFN(·call33554432, 33554432)
   500  CALLFN(·call67108864, 67108864)
   501  CALLFN(·call134217728, 134217728)
   502  CALLFN(·call268435456, 268435456)
   503  CALLFN(·call536870912, 536870912)
   504  CALLFN(·call1073741824, 1073741824)
   505  
   506  TEXT runtime·procyield(SB),NOSPLIT,$0-0
   507  	MOVL	cycles+0(FP), AX
   508  again:
   509  	PAUSE
   510  	SUBL	$1, AX
   511  	JNZ	again
   512  	RET
   513  
   514  
   515  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   516  	// Stores are already ordered on x86, so this is just a
   517  	// compile barrier.
   518  	RET
   519  
   520  // void jmpdefer(fn, sp);
   521  // called from deferreturn.
   522  // 1. pop the caller
   523  // 2. sub 5 bytes from the caller's return address
   524  // 3. jmp to the argument
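// Subtracting 5 works because the caller reached deferreturn via a CALL
// instruction with a 32-bit relative offset, which is 5 bytes long; after the
// deferred function returns, the caller re-executes that CALL and deferreturn
// runs again for any remaining deferred functions.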
   525  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   526  	MOVQ	fv+0(FP), DX	// fn
   527  	MOVQ	argp+8(FP), BX	// caller sp
   528  	LEAQ	-8(BX), SP	// caller sp after CALL
   529  	SUBQ	$5, (SP)	// return to CALL again
   530  	MOVQ	0(DX), BX
   531  	JMP	BX	// but first run the deferred function
   532  
   533  // Save state of caller into g->sched. Smashes R8, R9.
   534  TEXT gosave<>(SB),NOSPLIT,$0
   535  	get_tls(R8)
   536  	MOVQ	g(R8), R8
   537  	MOVQ	0(SP), R9
   538  	MOVQ	R9, (g_sched+gobuf_pc)(R8)
   539  	LEAQ	8(SP), R9
   540  	MOVQ	R9, (g_sched+gobuf_sp)(R8)
   541  	MOVQ	$0, (g_sched+gobuf_ret)(R8)
   542  	MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
   543  	MOVQ	BP, (g_sched+gobuf_bp)(R8)
   544  	RET
   545  
   546  // func asmcgocall(fn, arg unsafe.Pointer) int32
   547  // Call fn(arg) on the scheduler stack,
   548  // aligned appropriately for the gcc ABI.
   549  // See cgocall.go for more details.
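// Rough shape of what follows (a sketch, not extra code): unless we are
// already on a system stack (or have no g at all), save state with gosave<>,
// switch to m->g0, align SP for the C ABI, call fn with arg in DI (CX on
// Windows), then restore g and SP and store the 32-bit C result from AX.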
   550  TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   551  	MOVQ	fn+0(FP), AX
   552  	MOVQ	arg+8(FP), BX
   553  
   554  	MOVQ	SP, DX
   555  
   556  	// Figure out if we need to switch to m->g0 stack.
   557  	// We get called to create new OS threads too, and those
   558  	// come in on the m->g0 stack already.
   559  	get_tls(CX)
   560  	MOVQ	g(CX), R8
   561  	CMPQ	R8, $0
   562  	JEQ	nosave
   563  	MOVQ	g_m(R8), R8
   564  	MOVQ	m_g0(R8), SI
   565  	MOVQ	g(CX), DI
   566  	CMPQ	SI, DI
   567  	JEQ	nosave
   568  	MOVQ	m_gsignal(R8), SI
   569  	CMPQ	SI, DI
   570  	JEQ	nosave
   571  	
   572  	// Switch to system stack.
   573  	MOVQ	m_g0(R8), SI
   574  	CALL	gosave<>(SB)
   575  	MOVQ	SI, g(CX)
   576  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   577  
   578  	// Now on a scheduling stack (a pthread-created stack).
   579  	// Make sure we have enough room for 4 stack-backed fast-call
   580  	// registers as per the Windows amd64 calling convention.
   581  	SUBQ	$64, SP
   582  	ANDQ	$~15, SP	// alignment for gcc ABI
   583  	MOVQ	DI, 48(SP)	// save g
   584  	MOVQ	(g_stack+stack_hi)(DI), DI
   585  	SUBQ	DX, DI
   586  	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   587  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   588  	MOVQ	BX, CX		// CX = first argument in Win64
   589  	CALL	AX
   590  
   591  	// Restore registers, g, stack pointer.
   592  	get_tls(CX)
   593  	MOVQ	48(SP), DI
   594  	MOVQ	(g_stack+stack_hi)(DI), SI
   595  	SUBQ	40(SP), SI
   596  	MOVQ	DI, g(CX)
   597  	MOVQ	SI, SP
   598  
   599  	MOVL	AX, ret+16(FP)
   600  	RET
   601  
   602  nosave:
   603  	// Running on a system stack, perhaps even without a g.
   604  	// Having no g can happen during thread creation or thread teardown
   605  	// (see needm/dropm on Solaris, for example).
   606  	// This code is like the above sequence but without saving/restoring g
   607  	// and without worrying about the stack moving out from under us
   608  	// (because we're on a system stack, not a goroutine stack).
   609  	// The above code could be used directly if already on a system stack,
   610  	// but then the only path through this code would be a rare case on Solaris.
   611  	// Using this code for all "already on system stack" calls exercises it more,
   612  	// which should help keep it correct.
   613  	SUBQ	$64, SP
   614  	ANDQ	$~15, SP
   615  	MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
   616  	MOVQ	DX, 40(SP)	// save original stack pointer
   617  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   618  	MOVQ	BX, CX		// CX = first argument in Win64
   619  	CALL	AX
   620  	MOVQ	40(SP), SI	// restore original stack pointer
   621  	MOVQ	SI, SP
   622  	MOVL	AX, ret+16(FP)
   623  	RET
   624  
   625  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
   626  // Turn the fn into a Go func (by taking its address) and call
   627  // cgocallback_gofunc.
   628  TEXT runtime·cgocallback(SB),NOSPLIT,$24-24
   629  	LEAQ	fn+0(FP), AX
   630  	MOVQ	AX, 0(SP)
   631  	MOVQ	frame+8(FP), AX
   632  	MOVQ	AX, 8(SP)
   633  	MOVQ	framesize+16(FP), AX
   634  	MOVQ	AX, 16(SP)
   635  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   636  	CALL	AX
   637  	RET
   638  
   639  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
   640  // See cgocall.go for more details.
   641  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-24
   642  	NO_LOCAL_POINTERS
   643  
   644  	// If g is nil, Go did not create the current thread.
   645  	// Call needm to obtain one m for temporary use.
   646  	// In this case, we're running on the thread stack, so there's
   647  	// lots of space, but the linker doesn't know. Hide the call from
   648  	// the linker analysis by using an indirect call through AX.
   649  	get_tls(CX)
   650  #ifdef GOOS_windows
   651  	MOVL	$0, BX
   652  	CMPQ	CX, $0
   653  	JEQ	2(PC)
   654  #endif
   655  	MOVQ	g(CX), BX
   656  	CMPQ	BX, $0
   657  	JEQ	needm
   658  	MOVQ	g_m(BX), BX
   659  	MOVQ	BX, R8 // holds oldm until end of function
   660  	JMP	havem
   661  needm:
   662  	MOVQ	$0, 0(SP)
   663  	MOVQ	$runtime·needm(SB), AX
   664  	CALL	AX
   665  	MOVQ	0(SP), R8
   666  	get_tls(CX)
   667  	MOVQ	g(CX), BX
   668  	MOVQ	g_m(BX), BX
   669  	
   670  	// Set m->sched.sp = SP, so that if a panic happens
   671  	// during the function we are about to execute, it will
   672  	// have a valid SP to run on the g0 stack.
   673  	// The next few lines (after the havem label)
   674  	// will save this SP onto the stack and then write
   675  	// the same SP back to m->sched.sp. That seems redundant,
   676  	// but if an unrecovered panic happens, unwindm will
   677  	// restore the g->sched.sp from the stack location
   678  	// and then systemstack will try to use it. If we don't set it here,
   679  	// that restored SP will be uninitialized (typically 0) and
   680  	// will not be usable.
   681  	MOVQ	m_g0(BX), SI
   682  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   683  
   684  havem:
   685  	// Now there's a valid m, and we're running on its m->g0.
   686  	// Save current m->g0->sched.sp on stack and then set it to SP.
   687  	// Save current sp in m->g0->sched.sp in preparation for
   688  	// switch back to m->curg stack.
   689  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   690  	MOVQ	m_g0(BX), SI
   691  	MOVQ	(g_sched+gobuf_sp)(SI), AX
   692  	MOVQ	AX, 0(SP)
   693  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   694  
   695  	// Switch to m->curg stack and call runtime.cgocallbackg.
   696  	// Because we are taking over the execution of m->curg
   697  	// but *not* resuming what had been running, we need to
   698  	// save that information (m->curg->sched) so we can restore it.
   699  	// We can restore m->curg->sched.sp easily, because calling
   700  	// runtime.cgocallbackg leaves SP unchanged upon return.
   701  	// To save m->curg->sched.pc, we push it onto the stack.
   702  	// This has the added benefit that it looks to the traceback
   703  	// routine like cgocallbackg is going to return to that
   704  	// PC (because the frame we allocate below has the same
   705  	// size as cgocallback_gofunc's frame declared above)
   706  	// so that the traceback will seamlessly trace back into
   707  	// the earlier calls.
   708  	//
   709  	// In the new goroutine, 0(SP) holds the saved R8.
   710  	MOVQ	m_curg(BX), SI
   711  	MOVQ	SI, g(CX)
   712  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   713  	MOVQ	(g_sched+gobuf_pc)(SI), BX
   714  	MOVQ	BX, -8(DI)
   715  	// Compute the size of the frame, including return PC and, if
   716  	// GOEXPERIMENT=framepointer, the saved base pointer.
   717  	LEAQ	fv+0(FP), AX
   718  	SUBQ	SP, AX
   719  	SUBQ	AX, DI
   720  	MOVQ	DI, SP
   721  
   722  	MOVQ	R8, 0(SP)
   723  	CALL	runtime·cgocallbackg(SB)
   724  	MOVQ	0(SP), R8
   725  
   726  	// Compute the size of the frame again. FP and SP have
   727  	// completely different values here than they did above,
   728  	// but only their difference matters.
   729  	LEAQ	fv+0(FP), AX
   730  	SUBQ	SP, AX
   731  
   732  	// Restore g->sched (== m->curg->sched) from saved values.
   733  	get_tls(CX)
   734  	MOVQ	g(CX), SI
   735  	MOVQ	SP, DI
   736  	ADDQ	AX, DI
   737  	MOVQ	-8(DI), BX
   738  	MOVQ	BX, (g_sched+gobuf_pc)(SI)
   739  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   740  
   741  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   742  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   743  	// so we do not have to restore it.)
   744  	MOVQ	g(CX), BX
   745  	MOVQ	g_m(BX), BX
   746  	MOVQ	m_g0(BX), SI
   747  	MOVQ	SI, g(CX)
   748  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   749  	MOVQ	0(SP), AX
   750  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   751  	
   752  	// If the m on entry was nil, we called needm above to borrow an m
   753  	// for the duration of the call. Since the call is over, return it with dropm.
   754  	CMPQ	R8, $0
   755  	JNE 3(PC)
   756  	MOVQ	$runtime·dropm(SB), AX
   757  	CALL	AX
   758  
   759  	// Done!
   760  	RET
   761  
   762  // void setg(G*); set g. for use by needm.
   763  TEXT runtime·setg(SB), NOSPLIT, $0-8
   764  	MOVQ	gg+0(FP), BX
   765  #ifdef GOOS_windows
   766  	CMPQ	BX, $0
   767  	JNE	settls
   768  	MOVQ	$0, 0x28(GS)
   769  	RET
   770  settls:
   771  	MOVQ	g_m(BX), AX
   772  	LEAQ	m_tls(AX), AX
   773  	MOVQ	AX, 0x28(GS)
   774  #endif
   775  	get_tls(CX)
   776  	MOVQ	BX, g(CX)
   777  	RET
   778  
   779  // void setg_gcc(G*); set g called from gcc.
   780  TEXT setg_gcc<>(SB),NOSPLIT,$0
   781  	get_tls(AX)
   782  	MOVQ	DI, g(AX)
   783  	RET
   784  
   785  // check that SP is in range [g->stack.lo, g->stack.hi)
   786  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   787  	get_tls(CX)
   788  	MOVQ	g(CX), AX
   789  	CMPQ	(g_stack+stack_hi)(AX), SP
   790  	JHI	2(PC)
   791  	INT	$3
   792  	CMPQ	SP, (g_stack+stack_lo)(AX)
   793  	JHI	2(PC)
   794  	INT	$3
   795  	RET
   796  
   797  TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
   798  	MOVQ	argp+0(FP),AX		// addr of first arg
   799  	MOVQ	-8(AX),AX		// get calling pc
   800  	CMPQ	AX, runtime·stackBarrierPC(SB)
   801  	JNE	nobar
   802  	// Get original return PC.
   803  	CALL	runtime·nextBarrierPC(SB)
   804  	MOVQ	0(SP), AX
   805  nobar:
   806  	MOVQ	AX, ret+8(FP)
   807  	RET
   808  
   809  TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
   810  	MOVQ	argp+0(FP),AX		// addr of first arg
   811  	MOVQ	pc+8(FP), BX
   812  	MOVQ	-8(AX), CX
   813  	CMPQ	CX, runtime·stackBarrierPC(SB)
   814  	JEQ	setbar
   815  	MOVQ	BX, -8(AX)		// set calling pc
   816  	RET
   817  setbar:
   818  	// Set the stack barrier return PC.
   819  	MOVQ	BX, 0(SP)
   820  	CALL	runtime·setNextBarrierPC(SB)
   821  	RET
   822  
   823  TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
   824  	MOVQ	argp+0(FP), AX
   825  	MOVQ	AX, ret+8(FP)
   826  	RET
   827  
   828  // func cputicks() int64
   829  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   830  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   831  	JNE	mfence
   832  	LFENCE
   833  	JMP	done
   834  mfence:
   835  	MFENCE
   836  done:
   837  	RDTSC			// DX:AX = time-stamp counter
   838  	SHLQ	$32, DX
   839  	ADDQ	DX, AX		// AX = DX<<32 + low 32 bits
   840  	MOVQ	AX, ret+0(FP)
   841  	RET
   842  
   843  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   844  // redirects to memhash(p, h, size) using the size
   845  // stored in the closure.
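// Roughly equivalent Go (an illustrative sketch; "size" stands for the value
// the compiler stores at offset 8 in the closure, loaded from 8(DX) below):
//	func memhash_varlen(p unsafe.Pointer, h uintptr) uintptr {
//		return memhash(p, h, size)
//	}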
   846  TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
   847  	GO_ARGS
   848  	NO_LOCAL_POINTERS
   849  	MOVQ	p+0(FP), AX
   850  	MOVQ	h+8(FP), BX
   851  	MOVQ	8(DX), CX
   852  	MOVQ	AX, 0(SP)
   853  	MOVQ	BX, 8(SP)
   854  	MOVQ	CX, 16(SP)
   855  	CALL	runtime·memhash(SB)
   856  	MOVQ	24(SP), AX
   857  	MOVQ	AX, ret+16(FP)
   858  	RET
   859  
   860  // hash function using AES hardware instructions
   861  TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   862  	MOVQ	p+0(FP), AX	// ptr to data
   863  	MOVQ	s+16(FP), CX	// size
   864  	LEAQ	ret+24(FP), DX
   865  	JMP	runtime·aeshashbody(SB)
   866  
   867  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
   868  	MOVQ	p+0(FP), AX	// ptr to string struct
   869  	MOVQ	8(AX), CX	// length of string
   870  	MOVQ	(AX), AX	// string data
   871  	LEAQ	ret+16(FP), DX
   872  	JMP	runtime·aeshashbody(SB)
   873  
   874  // AX: data
   875  // CX: length
   876  // DX: address to put return value
   877  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   878  	// Fill an SSE register with our seeds.
   879  	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
   880  	PINSRW	$4, CX, X0			// 16 bits of length
   881  	PSHUFHW $0, X0, X0			// repeat length 4 times total
   882  	MOVO	X0, X1				// save unscrambled seed
   883  	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
   884  	AESENC	X0, X0				// scramble seed
   885  
   886  	CMPQ	CX, $16
   887  	JB	aes0to15
   888  	JE	aes16
   889  	CMPQ	CX, $32
   890  	JBE	aes17to32
   891  	CMPQ	CX, $64
   892  	JBE	aes33to64
   893  	CMPQ	CX, $128
   894  	JBE	aes65to128
   895  	JMP	aes129plus
   896  
   897  aes0to15:
   898  	TESTQ	CX, CX
   899  	JE	aes0
   900  
   901  	ADDQ	$16, AX
   902  	TESTW	$0xff0, AX
   903  	JE	endofpage
   904  
   905  	// 16 bytes loaded at this address won't cross
   906  	// a page boundary, so we can load it directly.
   907  	MOVOU	-16(AX), X1
   908  	ADDQ	CX, CX
   909  	MOVQ	$masks<>(SB), AX
   910  	PAND	(AX)(CX*8), X1
   911  final1:
   912  	AESENC	X0, X1	// scramble input, xor in seed
   913  	AESENC	X1, X1  // scramble combo 2 times
   914  	AESENC	X1, X1
   915  	MOVQ	X1, (DX)
   916  	RET
   917  
   918  endofpage:
   919  	// address ends in 1111xxxx. Might be up against
   920  	// a page boundary, so load ending at last byte.
   921  	// Then shift bytes down using pshufb.
   922  	MOVOU	-32(AX)(CX*1), X1
   923  	ADDQ	CX, CX
   924  	MOVQ	$shifts<>(SB), AX
   925  	PSHUFB	(AX)(CX*8), X1
   926  	JMP	final1
   927  
   928  aes0:
   929  	// Return scrambled input seed
   930  	AESENC	X0, X0
   931  	MOVQ	X0, (DX)
   932  	RET
   933  
   934  aes16:
   935  	MOVOU	(AX), X1
   936  	JMP	final1
   937  
   938  aes17to32:
   939  	// make second starting seed
   940  	PXOR	runtime·aeskeysched+16(SB), X1
   941  	AESENC	X1, X1
   942  	
   943  	// load data to be hashed
   944  	MOVOU	(AX), X2
   945  	MOVOU	-16(AX)(CX*1), X3
   946  
   947  	// scramble 3 times
   948  	AESENC	X0, X2
   949  	AESENC	X1, X3
   950  	AESENC	X2, X2
   951  	AESENC	X3, X3
   952  	AESENC	X2, X2
   953  	AESENC	X3, X3
   954  
   955  	// combine results
   956  	PXOR	X3, X2
   957  	MOVQ	X2, (DX)
   958  	RET
   959  
   960  aes33to64:
   961  	// make 3 more starting seeds
   962  	MOVO	X1, X2
   963  	MOVO	X1, X3
   964  	PXOR	runtime·aeskeysched+16(SB), X1
   965  	PXOR	runtime·aeskeysched+32(SB), X2
   966  	PXOR	runtime·aeskeysched+48(SB), X3
   967  	AESENC	X1, X1
   968  	AESENC	X2, X2
   969  	AESENC	X3, X3
   970  	
   971  	MOVOU	(AX), X4
   972  	MOVOU	16(AX), X5
   973  	MOVOU	-32(AX)(CX*1), X6
   974  	MOVOU	-16(AX)(CX*1), X7
   975  	
   976  	AESENC	X0, X4
   977  	AESENC	X1, X5
   978  	AESENC	X2, X6
   979  	AESENC	X3, X7
   980  	
   981  	AESENC	X4, X4
   982  	AESENC	X5, X5
   983  	AESENC	X6, X6
   984  	AESENC	X7, X7
   985  	
   986  	AESENC	X4, X4
   987  	AESENC	X5, X5
   988  	AESENC	X6, X6
   989  	AESENC	X7, X7
   990  
   991  	PXOR	X6, X4
   992  	PXOR	X7, X5
   993  	PXOR	X5, X4
   994  	MOVQ	X4, (DX)
   995  	RET
   996  
   997  aes65to128:
   998  	// make 7 more starting seeds
   999  	MOVO	X1, X2
  1000  	MOVO	X1, X3
  1001  	MOVO	X1, X4
  1002  	MOVO	X1, X5
  1003  	MOVO	X1, X6
  1004  	MOVO	X1, X7
  1005  	PXOR	runtime·aeskeysched+16(SB), X1
  1006  	PXOR	runtime·aeskeysched+32(SB), X2
  1007  	PXOR	runtime·aeskeysched+48(SB), X3
  1008  	PXOR	runtime·aeskeysched+64(SB), X4
  1009  	PXOR	runtime·aeskeysched+80(SB), X5
  1010  	PXOR	runtime·aeskeysched+96(SB), X6
  1011  	PXOR	runtime·aeskeysched+112(SB), X7
  1012  	AESENC	X1, X1
  1013  	AESENC	X2, X2
  1014  	AESENC	X3, X3
  1015  	AESENC	X4, X4
  1016  	AESENC	X5, X5
  1017  	AESENC	X6, X6
  1018  	AESENC	X7, X7
  1019  
  1020  	// load data
  1021  	MOVOU	(AX), X8
  1022  	MOVOU	16(AX), X9
  1023  	MOVOU	32(AX), X10
  1024  	MOVOU	48(AX), X11
  1025  	MOVOU	-64(AX)(CX*1), X12
  1026  	MOVOU	-48(AX)(CX*1), X13
  1027  	MOVOU	-32(AX)(CX*1), X14
  1028  	MOVOU	-16(AX)(CX*1), X15
  1029  
  1030  	// scramble data, xor in seed
  1031  	AESENC	X0, X8
  1032  	AESENC	X1, X9
  1033  	AESENC	X2, X10
  1034  	AESENC	X3, X11
  1035  	AESENC	X4, X12
  1036  	AESENC	X5, X13
  1037  	AESENC	X6, X14
  1038  	AESENC	X7, X15
  1039  
  1040  	// scramble twice
  1041  	AESENC	X8, X8
  1042  	AESENC	X9, X9
  1043  	AESENC	X10, X10
  1044  	AESENC	X11, X11
  1045  	AESENC	X12, X12
  1046  	AESENC	X13, X13
  1047  	AESENC	X14, X14
  1048  	AESENC	X15, X15
  1049  	
  1050  	AESENC	X8, X8
  1051  	AESENC	X9, X9
  1052  	AESENC	X10, X10
  1053  	AESENC	X11, X11
  1054  	AESENC	X12, X12
  1055  	AESENC	X13, X13
  1056  	AESENC	X14, X14
  1057  	AESENC	X15, X15
  1058  
  1059  	// combine results
  1060  	PXOR	X12, X8
  1061  	PXOR	X13, X9
  1062  	PXOR	X14, X10
  1063  	PXOR	X15, X11
  1064  	PXOR	X10, X8
  1065  	PXOR	X11, X9
  1066  	PXOR	X9, X8
  1067  	MOVQ	X8, (DX)
  1068  	RET
  1069  
  1070  aes129plus:
  1071  	// make 7 more starting seeds
  1072  	MOVO	X1, X2
  1073  	MOVO	X1, X3
  1074  	MOVO	X1, X4
  1075  	MOVO	X1, X5
  1076  	MOVO	X1, X6
  1077  	MOVO	X1, X7
  1078  	PXOR	runtime·aeskeysched+16(SB), X1
  1079  	PXOR	runtime·aeskeysched+32(SB), X2
  1080  	PXOR	runtime·aeskeysched+48(SB), X3
  1081  	PXOR	runtime·aeskeysched+64(SB), X4
  1082  	PXOR	runtime·aeskeysched+80(SB), X5
  1083  	PXOR	runtime·aeskeysched+96(SB), X6
  1084  	PXOR	runtime·aeskeysched+112(SB), X7
  1085  	AESENC	X1, X1
  1086  	AESENC	X2, X2
  1087  	AESENC	X3, X3
  1088  	AESENC	X4, X4
  1089  	AESENC	X5, X5
  1090  	AESENC	X6, X6
  1091  	AESENC	X7, X7
  1092  	
  1093  	// start with last (possibly overlapping) block
  1094  	MOVOU	-128(AX)(CX*1), X8
  1095  	MOVOU	-112(AX)(CX*1), X9
  1096  	MOVOU	-96(AX)(CX*1), X10
  1097  	MOVOU	-80(AX)(CX*1), X11
  1098  	MOVOU	-64(AX)(CX*1), X12
  1099  	MOVOU	-48(AX)(CX*1), X13
  1100  	MOVOU	-32(AX)(CX*1), X14
  1101  	MOVOU	-16(AX)(CX*1), X15
  1102  
  1103  	// scramble input once, xor in seed
  1104  	AESENC	X0, X8
  1105  	AESENC	X1, X9
  1106  	AESENC	X2, X10
  1107  	AESENC	X3, X11
  1108  	AESENC	X4, X12
  1109  	AESENC	X5, X13
  1110  	AESENC	X6, X14
  1111  	AESENC	X7, X15
  1112  	
  1113  	// compute number of remaining 128-byte blocks
  1114  	DECQ	CX
  1115  	SHRQ	$7, CX
  1116  	
  1117  aesloop:
  1118  	// scramble state, xor in a block
  1119  	MOVOU	(AX), X0
  1120  	MOVOU	16(AX), X1
  1121  	MOVOU	32(AX), X2
  1122  	MOVOU	48(AX), X3
  1123  	AESENC	X0, X8
  1124  	AESENC	X1, X9
  1125  	AESENC	X2, X10
  1126  	AESENC	X3, X11
  1127  	MOVOU	64(AX), X4
  1128  	MOVOU	80(AX), X5
  1129  	MOVOU	96(AX), X6
  1130  	MOVOU	112(AX), X7
  1131  	AESENC	X4, X12
  1132  	AESENC	X5, X13
  1133  	AESENC	X6, X14
  1134  	AESENC	X7, X15
  1135  
  1136  	// scramble state
  1137  	AESENC	X8, X8
  1138  	AESENC	X9, X9
  1139  	AESENC	X10, X10
  1140  	AESENC	X11, X11
  1141  	AESENC	X12, X12
  1142  	AESENC	X13, X13
  1143  	AESENC	X14, X14
  1144  	AESENC	X15, X15
  1145  
  1146  	ADDQ	$128, AX
  1147  	DECQ	CX
  1148  	JNE	aesloop
  1149  
  1150  	// 2 more scrambles to finish
  1151  	AESENC	X8, X8
  1152  	AESENC	X9, X9
  1153  	AESENC	X10, X10
  1154  	AESENC	X11, X11
  1155  	AESENC	X12, X12
  1156  	AESENC	X13, X13
  1157  	AESENC	X14, X14
  1158  	AESENC	X15, X15
  1159  	AESENC	X8, X8
  1160  	AESENC	X9, X9
  1161  	AESENC	X10, X10
  1162  	AESENC	X11, X11
  1163  	AESENC	X12, X12
  1164  	AESENC	X13, X13
  1165  	AESENC	X14, X14
  1166  	AESENC	X15, X15
  1167  
  1168  	PXOR	X12, X8
  1169  	PXOR	X13, X9
  1170  	PXOR	X14, X10
  1171  	PXOR	X15, X11
  1172  	PXOR	X10, X8
  1173  	PXOR	X11, X9
  1174  	PXOR	X9, X8
  1175  	MOVQ	X8, (DX)
  1176  	RET
  1177  	
  1178  TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
  1179  	MOVQ	p+0(FP), AX	// ptr to data
  1180  	MOVQ	h+8(FP), X0	// seed
  1181  	PINSRD	$2, (AX), X0	// data
  1182  	AESENC	runtime·aeskeysched+0(SB), X0
  1183  	AESENC	runtime·aeskeysched+16(SB), X0
  1184  	AESENC	runtime·aeskeysched+32(SB), X0
  1185  	MOVQ	X0, ret+16(FP)
  1186  	RET
  1187  
  1188  TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
  1189  	MOVQ	p+0(FP), AX	// ptr to data
  1190  	MOVQ	h+8(FP), X0	// seed
  1191  	PINSRQ	$1, (AX), X0	// data
  1192  	AESENC	runtime·aeskeysched+0(SB), X0
  1193  	AESENC	runtime·aeskeysched+16(SB), X0
  1194  	AESENC	runtime·aeskeysched+32(SB), X0
  1195  	MOVQ	X0, ret+16(FP)
  1196  	RET
  1197  
  1198  // simple mask to get rid of data in the high part of the register.
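// Entry i (at offset i*16) keeps only the low i bytes of a 16-byte register.
// For example, the entry at offset 0x30 is 0x0000000000ffffff, used when
// hashing a 3-byte value.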
  1199  DATA masks<>+0x00(SB)/8, $0x0000000000000000
  1200  DATA masks<>+0x08(SB)/8, $0x0000000000000000
  1201  DATA masks<>+0x10(SB)/8, $0x00000000000000ff
  1202  DATA masks<>+0x18(SB)/8, $0x0000000000000000
  1203  DATA masks<>+0x20(SB)/8, $0x000000000000ffff
  1204  DATA masks<>+0x28(SB)/8, $0x0000000000000000
  1205  DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
  1206  DATA masks<>+0x38(SB)/8, $0x0000000000000000
  1207  DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
  1208  DATA masks<>+0x48(SB)/8, $0x0000000000000000
  1209  DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
  1210  DATA masks<>+0x58(SB)/8, $0x0000000000000000
  1211  DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
  1212  DATA masks<>+0x68(SB)/8, $0x0000000000000000
  1213  DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
  1214  DATA masks<>+0x78(SB)/8, $0x0000000000000000
  1215  DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
  1216  DATA masks<>+0x88(SB)/8, $0x0000000000000000
  1217  DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
  1218  DATA masks<>+0x98(SB)/8, $0x00000000000000ff
  1219  DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
  1220  DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
  1221  DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
  1222  DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
  1223  DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
  1224  DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
  1225  DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
  1226  DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
  1227  DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
  1228  DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
  1229  DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
  1230  DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
  1231  GLOBL masks<>(SB),RODATA,$256
  1232  
  1233  TEXT ·checkASM(SB),NOSPLIT,$0-1
  1234  	// check that masks<>(SB) and shifts<>(SB) are 16-byte aligned
  1235  	MOVQ	$masks<>(SB), AX
  1236  	MOVQ	$shifts<>(SB), BX
  1237  	ORQ	BX, AX
  1238  	TESTQ	$15, AX
  1239  	SETEQ	ret+0(FP)
  1240  	RET
  1241  
  1242  // These are arguments to PSHUFB. They move data down from
  1243  // the high bytes of the register to the low bytes of the register.
  1244  // The index is how many bytes to move.
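// For example, with a 1-byte input the entry at offset 0x10 (index byte 0x0f,
// all other bytes 0xff) moves byte 15 (the last byte loaded) down to byte 0
// and zeroes the rest of the register.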
  1245  DATA shifts<>+0x00(SB)/8, $0x0000000000000000
  1246  DATA shifts<>+0x08(SB)/8, $0x0000000000000000
  1247  DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
  1248  DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
  1249  DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
  1250  DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
  1251  DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
  1252  DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
  1253  DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
  1254  DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
  1255  DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
  1256  DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
  1257  DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
  1258  DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
  1259  DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
  1260  DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
  1261  DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
  1262  DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
  1263  DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
  1264  DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
  1265  DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
  1266  DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
  1267  DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
  1268  DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
  1269  DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
  1270  DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
  1271  DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
  1272  DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
  1273  DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
  1274  DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
  1275  DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
  1276  DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
  1277  GLOBL shifts<>(SB),RODATA,$256
  1278  
  1279  // memequal(p, q unsafe.Pointer, size uintptr) bool
  1280  TEXT runtime·memequal(SB),NOSPLIT,$0-25
  1281  	MOVQ	a+0(FP), SI
  1282  	MOVQ	b+8(FP), DI
  1283  	CMPQ	SI, DI
  1284  	JEQ	eq
  1285  	MOVQ	size+16(FP), BX
  1286  	LEAQ	ret+24(FP), AX
  1287  	JMP	runtime·memeqbody(SB)
  1288  eq:
  1289  	MOVB	$1, ret+24(FP)
  1290  	RET
  1291  
  1292  // memequal_varlen(a, b unsafe.Pointer) bool
  1293  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
  1294  	MOVQ	a+0(FP), SI
  1295  	MOVQ	b+8(FP), DI
  1296  	CMPQ	SI, DI
  1297  	JEQ	eq
  1298  	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
  1299  	LEAQ	ret+16(FP), AX
  1300  	JMP	runtime·memeqbody(SB)
  1301  eq:
  1302  	MOVB	$1, ret+16(FP)
  1303  	RET
  1304  
  1305  // eqstring tests whether two strings are equal.
  1306  // The compiler guarantees that strings passed
  1307  // to eqstring have equal length.
  1308  // See runtime_test.go:eqstring_generic for
  1309  // equivalent Go code.
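// Sketch of that Go code (illustrative; see eqstring_generic for the real thing):
//	func eqstring(s1, s2 string) bool {
//		for i := 0; i < len(s1); i++ {	// lengths already known to be equal
//			if s1[i] != s2[i] {
//				return false
//			}
//		}
//		return true
//	}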
  1310  TEXT runtime·eqstring(SB),NOSPLIT,$0-33
  1311  	MOVQ	s1str+0(FP), SI
  1312  	MOVQ	s2str+16(FP), DI
  1313  	CMPQ	SI, DI
  1314  	JEQ	eq
  1315  	MOVQ	s1len+8(FP), BX
  1316  	LEAQ	v+32(FP), AX
  1317  	JMP	runtime·memeqbody(SB)
  1318  eq:
  1319  	MOVB	$1, v+32(FP)
  1320  	RET
  1321  
  1322  // a in SI
  1323  // b in DI
  1324  // count in BX
  1325  // address of result byte in AX
  1326  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1327  	CMPQ	BX, $8
  1328  	JB	small
  1329  	CMPQ	BX, $64
  1330  	JB	bigloop
  1331  	CMPB    runtime·support_avx2(SB), $1
  1332  	JE	hugeloop_avx2
  1333  	
  1334  	// 64 bytes at a time using xmm registers
  1335  hugeloop:
  1336  	CMPQ	BX, $64
  1337  	JB	bigloop
  1338  	MOVOU	(SI), X0
  1339  	MOVOU	(DI), X1
  1340  	MOVOU	16(SI), X2
  1341  	MOVOU	16(DI), X3
  1342  	MOVOU	32(SI), X4
  1343  	MOVOU	32(DI), X5
  1344  	MOVOU	48(SI), X6
  1345  	MOVOU	48(DI), X7
  1346  	PCMPEQB	X1, X0
  1347  	PCMPEQB	X3, X2
  1348  	PCMPEQB	X5, X4
  1349  	PCMPEQB	X7, X6
  1350  	PAND	X2, X0
  1351  	PAND	X6, X4
  1352  	PAND	X4, X0
  1353  	PMOVMSKB X0, DX
  1354  	ADDQ	$64, SI
  1355  	ADDQ	$64, DI
  1356  	SUBQ	$64, BX
  1357  	CMPL	DX, $0xffff
  1358  	JEQ	hugeloop
  1359  	MOVB	$0, (AX)
  1360  	RET
  1361  
  1362  	// 64 bytes at a time using ymm registers
  1363  hugeloop_avx2:
  1364  	CMPQ	BX, $64
  1365  	JB	bigloop_avx2
  1366  	VMOVDQU	(SI), Y0
  1367  	VMOVDQU	(DI), Y1
  1368  	VMOVDQU	32(SI), Y2
  1369  	VMOVDQU	32(DI), Y3
  1370  	VPCMPEQB	Y1, Y0, Y4
  1371  	VPCMPEQB	Y2, Y3, Y5
  1372  	VPAND	Y4, Y5, Y6
  1373  	VPMOVMSKB Y6, DX
  1374  	ADDQ	$64, SI
  1375  	ADDQ	$64, DI
  1376  	SUBQ	$64, BX
  1377  	CMPL	DX, $0xffffffff
  1378  	JEQ	hugeloop_avx2
  1379  	VZEROUPPER
  1380  	MOVB	$0, (AX)
  1381  	RET
  1382  
  1383  bigloop_avx2:
  1384  	VZEROUPPER
  1385  
  1386  	// 8 bytes at a time using 64-bit register
  1387  bigloop:
  1388  	CMPQ	BX, $8
  1389  	JBE	leftover
  1390  	MOVQ	(SI), CX
  1391  	MOVQ	(DI), DX
  1392  	ADDQ	$8, SI
  1393  	ADDQ	$8, DI
  1394  	SUBQ	$8, BX
  1395  	CMPQ	CX, DX
  1396  	JEQ	bigloop
  1397  	MOVB	$0, (AX)
  1398  	RET
  1399  
  1400  	// remaining 0-8 bytes
  1401  leftover:
  1402  	MOVQ	-8(SI)(BX*1), CX
  1403  	MOVQ	-8(DI)(BX*1), DX
  1404  	CMPQ	CX, DX
  1405  	SETEQ	(AX)
  1406  	RET
  1407  
  1408  small:
  1409  	CMPQ	BX, $0
  1410  	JEQ	equal
  1411  
  1412  	LEAQ	0(BX*8), CX
  1413  	NEGQ	CX
  1414  
  1415  	CMPB	SI, $0xf8
  1416  	JA	si_high
  1417  
  1418  	// load at SI won't cross a page boundary.
  1419  	MOVQ	(SI), SI
  1420  	JMP	si_finish
  1421  si_high:
  1422  	// address ends in 11111xxx. Load up through the bytes we want, then shift them into position.
  1423  	MOVQ	-8(SI)(BX*1), SI
  1424  	SHRQ	CX, SI
  1425  si_finish:
  1426  
  1427  	// same for DI.
  1428  	CMPB	DI, $0xf8
  1429  	JA	di_high
  1430  	MOVQ	(DI), DI
  1431  	JMP	di_finish
  1432  di_high:
  1433  	MOVQ	-8(DI)(BX*1), DI
  1434  	SHRQ	CX, DI
  1435  di_finish:
  1436  
  1437  	SUBQ	SI, DI
  1438  	SHLQ	CX, DI
  1439  equal:
  1440  	SETEQ	(AX)
  1441  	RET
  1442  
  1443  TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
  1444  	MOVQ	s1_base+0(FP), SI
  1445  	MOVQ	s1_len+8(FP), BX
  1446  	MOVQ	s2_base+16(FP), DI
  1447  	MOVQ	s2_len+24(FP), DX
  1448  	LEAQ	ret+32(FP), R9
  1449  	JMP	runtime·cmpbody(SB)
  1450  
  1451  TEXT bytes·Compare(SB),NOSPLIT,$0-56
  1452  	MOVQ	s1+0(FP), SI
  1453  	MOVQ	s1+8(FP), BX
  1454  	MOVQ	s2+24(FP), DI
  1455  	MOVQ	s2+32(FP), DX
  1456  	LEAQ	res+48(FP), R9
  1457  	JMP	runtime·cmpbody(SB)
  1458  
  1459  // input:
  1460  //   SI = a
  1461  //   DI = b
  1462  //   BX = alen
  1463  //   DX = blen
  1464  //   R9 = address of output word (stores -1/0/1 here)
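// output: *R9 = -1 if a < b, 0 if a == b, +1 if a > b, comparing the common
// prefix first and then the lengths (the same convention as bytes.Compare).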
  1465  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1466  	CMPQ	SI, DI
  1467  	JEQ	allsame
  1468  	CMPQ	BX, DX
  1469  	MOVQ	DX, R8
  1470  	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
  1471  	CMPQ	R8, $8
  1472  	JB	small
  1473  
  1474  	CMPQ	R8, $63
  1475  	JBE	loop
  1476  	CMPB    runtime·support_avx2(SB), $1
  1477  	JEQ     big_loop_avx2
  1478  	JMP	big_loop
  1479  loop:
  1480  	CMPQ	R8, $16
  1481  	JBE	_0through16
  1482  	MOVOU	(SI), X0
  1483  	MOVOU	(DI), X1
  1484  	PCMPEQB X0, X1
  1485  	PMOVMSKB X1, AX
  1486  	XORQ	$0xffff, AX	// convert EQ to NE
  1487  	JNE	diff16	// branch if at least one byte is not equal
  1488  	ADDQ	$16, SI
  1489  	ADDQ	$16, DI
  1490  	SUBQ	$16, R8
  1491  	JMP	loop
  1492  	
  1493  diff64:
  1494  	ADDQ	$48, SI
  1495  	ADDQ	$48, DI
  1496  	JMP	diff16
  1497  diff48:
  1498  	ADDQ	$32, SI
  1499  	ADDQ	$32, DI
  1500  	JMP	diff16
  1501  diff32:
  1502  	ADDQ	$16, SI
  1503  	ADDQ	$16, DI
  1504  	// AX = bit mask of differences
  1505  diff16:
  1506  	BSFQ	AX, BX	// index of first byte that differs
  1507  	XORQ	AX, AX
  1508  	MOVB	(SI)(BX*1), CX
  1509  	CMPB	CX, (DI)(BX*1)
  1510  	SETHI	AX
  1511  	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
  1512  	MOVQ	AX, (R9)
  1513  	RET
  1514  
  1515  	// 0 through 16 bytes left, alen>=8, blen>=8
  1516  _0through16:
  1517  	CMPQ	R8, $8
  1518  	JBE	_0through8
  1519  	MOVQ	(SI), AX
  1520  	MOVQ	(DI), CX
  1521  	CMPQ	AX, CX
  1522  	JNE	diff8
  1523  _0through8:
  1524  	MOVQ	-8(SI)(R8*1), AX
  1525  	MOVQ	-8(DI)(R8*1), CX
  1526  	CMPQ	AX, CX
  1527  	JEQ	allsame
  1528  
  1529  	// AX and CX contain parts of a and b that differ.
  1530  diff8:
  1531  	BSWAPQ	AX	// reverse order of bytes
  1532  	BSWAPQ	CX
  1533  	XORQ	AX, CX
  1534  	BSRQ	CX, CX	// index of highest bit difference
  1535  	SHRQ	CX, AX	// move a's bit to bottom
  1536  	ANDQ	$1, AX	// mask bit
  1537  	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
  1538  	MOVQ	AX, (R9)
  1539  	RET
  1540  
  1541  	// 0-7 bytes in common
  1542  small:
  1543  	LEAQ	(R8*8), CX	// bytes left -> bits left
  1544  	NEGQ	CX		// - bits left (== 64 - bits left mod 64)
  1545  	JEQ	allsame
  1546  
  1547  	// load bytes of a into high bytes of SI
  1548  	CMPB	SI, $0xf8
  1549  	JA	si_high
  1550  	MOVQ	(SI), SI
  1551  	JMP	si_finish
  1552  si_high:
  1553  	MOVQ	-8(SI)(R8*1), SI
  1554  	SHRQ	CX, SI
  1555  si_finish:
  1556  	SHLQ	CX, SI
  1557  
  1558  	// load bytes of b into high bytes of DI
  1559  	CMPB	DI, $0xf8
  1560  	JA	di_high
  1561  	MOVQ	(DI), DI
  1562  	JMP	di_finish
  1563  di_high:
  1564  	MOVQ	-8(DI)(R8*1), DI
  1565  	SHRQ	CX, DI
  1566  di_finish:
  1567  	SHLQ	CX, DI
  1568  
  1569  	BSWAPQ	SI	// reverse order of bytes
  1570  	BSWAPQ	DI
  1571  	XORQ	SI, DI	// find bit differences
  1572  	JEQ	allsame
  1573  	BSRQ	DI, CX	// index of highest bit difference
  1574  	SHRQ	CX, SI	// move a's bit to bottom
  1575  	ANDQ	$1, SI	// mask bit
  1576  	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
  1577  	MOVQ	AX, (R9)
  1578  	RET
  1579  
  1580  allsame:
  1581  	XORQ	AX, AX
  1582  	XORQ	CX, CX
  1583  	CMPQ	BX, DX
  1584  	SETGT	AX	// 1 if alen > blen
  1585  	SETEQ	CX	// 1 if alen == blen
  1586  	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
  1587  	MOVQ	AX, (R9)
  1588  	RET
  1589  
  1590  	// this works for >= 64 bytes of data.
  1591  big_loop:
  1592  	MOVOU	(SI), X0
  1593  	MOVOU	(DI), X1
  1594  	PCMPEQB X0, X1
  1595  	PMOVMSKB X1, AX
  1596  	XORQ	$0xffff, AX
  1597  	JNE	diff16
  1598  
  1599  	MOVOU	16(SI), X0
  1600  	MOVOU	16(DI), X1
  1601  	PCMPEQB X0, X1
  1602  	PMOVMSKB X1, AX
  1603  	XORQ	$0xffff, AX
  1604  	JNE	diff32
  1605  
  1606  	MOVOU	32(SI), X0
  1607  	MOVOU	32(DI), X1
  1608  	PCMPEQB X0, X1
  1609  	PMOVMSKB X1, AX
  1610  	XORQ	$0xffff, AX
  1611  	JNE	diff48
  1612  
  1613  	MOVOU	48(SI), X0
  1614  	MOVOU	48(DI), X1
  1615  	PCMPEQB X0, X1
  1616  	PMOVMSKB X1, AX
  1617  	XORQ	$0xffff, AX
  1618  	JNE	diff64
  1619  
  1620  	ADDQ	$64, SI
  1621  	ADDQ	$64, DI
  1622  	SUBQ	$64, R8
  1623  	CMPQ	R8, $64
  1624  	JBE	loop
  1625  	JMP	big_loop
  1626  
  1627  	// Compare 64-bytes per loop iteration.
  1628  	// Loop is unrolled and uses AVX2.
  1629  big_loop_avx2:
  1630  	VMOVDQU	(SI), Y2
  1631  	VMOVDQU	(DI), Y3
  1632  	VMOVDQU	32(SI), Y4
  1633  	VMOVDQU	32(DI), Y5
  1634  	VPCMPEQB Y2, Y3, Y0
  1635  	VPMOVMSKB Y0, AX
  1636  	XORL	$0xffffffff, AX
  1637  	JNE	diff32_avx2
  1638  	VPCMPEQB Y4, Y5, Y6
  1639  	VPMOVMSKB Y6, AX
  1640  	XORL	$0xffffffff, AX
  1641  	JNE	diff64_avx2
  1642  
  1643  	ADDQ	$64, SI
  1644  	ADDQ	$64, DI
  1645  	SUBQ	$64, R8
  1646  	CMPQ	R8, $64
  1647  	JB	big_loop_avx2_exit
  1648  	JMP	big_loop_avx2
  1649  
  1650  	// Avoid the AVX->SSE transition penalty and search the first 32 bytes of the 64-byte chunk.
  1651  diff32_avx2:
  1652  	VZEROUPPER
  1653  	JMP diff16
  1654  
  1655  	// Same as diff32_avx2, but for last 32 bytes.
  1656  diff64_avx2:
  1657  	VZEROUPPER
  1658  	JMP diff48
  1659  
  1660  	// If fewer than 64 bytes remain, jump to the normal loop.
  1661  big_loop_avx2_exit:
  1662  	VZEROUPPER
  1663  	JMP loop
  1664  
  1665  
  1666  // TODO: Also use this in bytes.Index
  1667  TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
  1668  	MOVQ s+0(FP), DI
  1669  	MOVQ s_len+8(FP), CX
  1670  	MOVQ c+16(FP), AX
  1671  	MOVQ c_len+24(FP), BX
  1672  	CMPQ BX, CX
  1673  	JA fail
  1674  	CMPQ BX, $2
  1675  	JA   _3_or_more
  1676  	MOVW (AX), AX
  1677  	LEAQ -1(DI)(CX*1), CX
  1678  loop2:
  1679  	MOVW (DI), SI
  1680  	CMPW SI,AX
  1681  	JZ success
  1682  	ADDQ $1,DI
  1683  	CMPQ DI,CX
  1684  	JB loop2
  1685  	JMP fail
  1686  _3_or_more:
  1687  	CMPQ BX, $3
  1688  	JA   _4_or_more
  1689  	MOVW 1(AX), DX
  1690  	MOVW (AX), AX
  1691  	LEAQ -2(DI)(CX*1), CX
  1692  loop3:
  1693  	MOVW (DI), SI
  1694  	CMPW SI,AX
  1695  	JZ   partial_success3
  1696  	ADDQ $1,DI
  1697  	CMPQ DI,CX
  1698  	JB loop3
  1699  	JMP fail
  1700  partial_success3:
  1701  	MOVW 1(DI), SI
  1702  	CMPW SI,DX
  1703  	JZ success
  1704  	ADDQ $1,DI
  1705  	CMPQ DI,CX
  1706  	JB loop3
  1707  	JMP fail
  1708  _4_or_more:
  1709  	CMPQ BX, $4
  1710  	JA   _5_or_more
  1711  	MOVL (AX), AX
  1712  	LEAQ -3(DI)(CX*1), CX
  1713  loop4:
  1714  	MOVL (DI), SI
  1715  	CMPL SI,AX
  1716  	JZ   success
  1717  	ADDQ $1,DI
  1718  	CMPQ DI,CX
  1719  	JB loop4
  1720  	JMP fail
  1721  _5_or_more:
  1722  	CMPQ BX, $7
  1723  	JA   _8_or_more
  1724  	LEAQ 1(DI)(CX*1), CX
  1725  	SUBQ BX, CX
  1726  	MOVL -4(AX)(BX*1), DX
  1727  	MOVL (AX), AX
  1728  loop5to7:
  1729  	MOVL (DI), SI
  1730  	CMPL SI,AX
  1731  	JZ   partial_success5to7
  1732  	ADDQ $1,DI
  1733  	CMPQ DI,CX
  1734  	JB loop5to7
  1735  	JMP fail
  1736  partial_success5to7:
  1737  	MOVL -4(BX)(DI*1), SI
  1738  	CMPL SI,DX
  1739  	JZ success
  1740  	ADDQ $1,DI
  1741  	CMPQ DI,CX
  1742  	JB loop5to7
  1743  	JMP fail
  1744  _8_or_more:
  1745  	CMPQ BX, $8
  1746  	JA   _9_or_more
  1747  	MOVQ (AX), AX
  1748  	LEAQ -7(DI)(CX*1), CX
  1749  loop8:
  1750  	MOVQ (DI), SI
  1751  	CMPQ SI,AX
  1752  	JZ   success
  1753  	ADDQ $1,DI
  1754  	CMPQ DI,CX
  1755  	JB loop8
  1756  	JMP fail
  1757  _9_or_more:
  1758  	CMPQ BX, $16
  1759  	JA   _16_or_more
  1760  	LEAQ 1(DI)(CX*1), CX
  1761  	SUBQ BX, CX
  1762  	MOVQ -8(AX)(BX*1), DX
  1763  	MOVQ (AX), AX
  1764  loop9to15:
  1765  	MOVQ (DI), SI
  1766  	CMPQ SI,AX
  1767  	JZ   partial_success9to15
  1768  	ADDQ $1,DI
  1769  	CMPQ DI,CX
  1770  	JB loop9to15
  1771  	JMP fail
  1772  partial_success9to15:
  1773  	MOVQ -8(BX)(DI*1), SI
  1774  	CMPQ SI,DX
  1775  	JZ success
  1776  	ADDQ $1,DI
  1777  	CMPQ DI,CX
  1778  	JB loop9to15
  1779  	JMP fail
  1780  _16_or_more:
  1781  	CMPQ BX, $16
  1782  	JA   _17_to_31
  1783  	MOVOU (AX), X1
  1784  	LEAQ -15(DI)(CX*1), CX
  1785  loop16:
  1786  	MOVOU (DI), X2
  1787  	PCMPEQB X1, X2
  1788  	PMOVMSKB X2, SI
  1789  	CMPQ  SI, $0xffff
  1790  	JE   success
  1791  	ADDQ $1,DI
  1792  	CMPQ DI,CX
  1793  	JB loop16
  1794  	JMP fail
  1795  _17_to_31:
  1796  	LEAQ 1(DI)(CX*1), CX
  1797  	SUBQ BX, CX
  1798  	MOVOU -16(AX)(BX*1), X0
  1799  	MOVOU (AX), X1
  1800  loop17to31:
  1801  	MOVOU (DI), X2
  1802  	PCMPEQB X1,X2
  1803  	PMOVMSKB X2, SI
  1804  	CMPQ  SI, $0xffff
  1805  	JE   partial_success17to31
  1806  	ADDQ $1,DI
  1807  	CMPQ DI,CX
  1808  	JB loop17to31
  1809  	JMP fail
  1810  partial_success17to31:
  1811  	MOVOU -16(BX)(DI*1), X3
  1812  	PCMPEQB X0, X3
  1813  	PMOVMSKB X3, SI
  1814  	CMPQ  SI, $0xffff
  1815  	JE success
  1816  	ADDQ $1,DI
  1817  	CMPQ DI,CX
  1818  	JB loop17to31
  1819  fail:
  1820  	MOVQ $-1, ret+32(FP)
  1821  	RET
  1822  success:
  1823  	SUBQ s+0(FP), DI
  1824  	MOVQ DI, ret+32(FP)
  1825  	RET
  1826  
  1827  
  1828  TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
  1829  	MOVQ s+0(FP), SI
  1830  	MOVQ s_len+8(FP), BX
  1831  	MOVB c+24(FP), AL
  1832  	LEAQ ret+32(FP), R8
  1833  	JMP  runtime·indexbytebody(SB)
  1834  
  1835  TEXT strings·IndexByte(SB),NOSPLIT,$0-32
  1836  	MOVQ s+0(FP), SI
  1837  	MOVQ s_len+8(FP), BX
  1838  	MOVB c+16(FP), AL
  1839  	LEAQ ret+24(FP), R8
  1840  	JMP  runtime·indexbytebody(SB)
  1841  
  1842  // input:
  1843  //   SI: data
  1844  //   BX: data len
  1845  //   AL: byte sought
  1846  //   R8: address to put result
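// output:
//   (R8): index of the first occurrence of AL in SI[0:BX], or -1 if absent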
  1847  TEXT runtime·indexbytebody(SB),NOSPLIT,$0
  1848  	// Shuffle X0 around so that each byte contains
  1849  	// the character we're looking for.
  1850  	MOVD AX, X0
  1851  	PUNPCKLBW X0, X0
  1852  	PUNPCKLBW X0, X0
  1853  	PSHUFL $0, X0, X0
  1854  	
  1855  	CMPQ BX, $16
  1856  	JLT small
  1857  
  1858  	MOVQ SI, DI
  1859  
  1860  	CMPQ BX, $32
  1861  	JA avx2
  1862  sse:
  1863  	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
  1864  	JMP	sseloopentry
  1865  	
  1866  sseloop:
  1867  	// Move the next 16-byte chunk of the data into X1.
  1868  	MOVOU	(DI), X1
  1869  	// Compare bytes in X0 to X1.
  1870  	PCMPEQB	X0, X1
  1871  	// Take the top bit of each byte in X1 and put the result in DX.
  1872  	PMOVMSKB X1, DX
  1873  	// Find first set bit, if any.
  1874  	BSFL	DX, DX
  1875  	JNZ	ssesuccess
  1876  	// Advance to next block.
  1877  	ADDQ	$16, DI
  1878  sseloopentry:
  1879  	CMPQ	DI, AX
  1880  	JB	sseloop
  1881  
  1882  	// Search the last 16-byte chunk. This chunk may overlap with the
  1883  	// chunks we've already searched, but that's ok.
  1884  	MOVQ	AX, DI
  1885  	MOVOU	(AX), X1
  1886  	PCMPEQB	X0, X1
  1887  	PMOVMSKB X1, DX
  1888  	BSFL	DX, DX
  1889  	JNZ	ssesuccess
  1890  
  1891  failure:
  1892  	MOVQ $-1, (R8)
  1893  	RET
  1894  
  1895  // We've found a chunk containing the byte.
  1896  // The chunk was loaded from DI.
  1897  // The index of the matching byte in the chunk is DX.
  1898  // The start of the data is SI.
  1899  ssesuccess:
  1900  	SUBQ SI, DI	// Compute offset of chunk within data.
  1901  	ADDQ DX, DI	// Add offset of byte within chunk.
  1902  	MOVQ DI, (R8)
  1903  	RET
  1904  
  1905  // handle lengths < 16
  1906  small:
  1907  	TESTQ	BX, BX
  1908  	JEQ	failure
  1909  
  1910  	// Check if we'll load across a page boundary.
  1911  	LEAQ	16(SI), AX
  1912  	TESTW	$0xff0, AX
  1913  	JEQ	endofpage
  1914  
  1915  	MOVOU	(SI), X1 // Load data
  1916  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  1917  	PMOVMSKB X1, DX	// Move result bits to integer register.
  1918  	BSFL	DX, DX	// Find first set bit.
  1919  	JZ	failure	// No set bit, failure.
  1920  	CMPL	DX, BX
  1921  	JAE	failure	// Match is past end of data.
  1922  	MOVQ	DX, (R8)
  1923  	RET
  1924  
  1925  endofpage:
  1926  	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
  1927  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  1928  	PMOVMSKB X1, DX	// Move result bits to integer register.
  1929  	MOVL	BX, CX
  1930  	SHLL	CX, DX
  1931  	SHRL	$16, DX	// Shift desired bits down to bottom of register.
  1932  	BSFL	DX, DX	// Find first set bit.
  1933  	JZ	failure	// No set bit, failure.
  1934  	MOVQ	DX, (R8)
  1935  	RET
  1936  
  1937  avx2:
  1938  	CMPB   runtime·support_avx2(SB), $1
  1939  	JNE sse
  1940  	MOVD AX, X0
  1941  	LEAQ -32(SI)(BX*1), R11
  1942  	VPBROADCASTB  X0, Y1
  1943  avx2_loop:
  1944  	VMOVDQU (DI), Y2
  1945  	VPCMPEQB Y1, Y2, Y3
  1946  	VPTEST Y3, Y3
  1947  	JNZ avx2success
  1948  	ADDQ $32, DI
  1949  	CMPQ DI, R11
  1950  	JLT avx2_loop
  1951  	MOVQ R11, DI
  1952  	VMOVDQU (DI), Y2
  1953  	VPCMPEQB Y1, Y2, Y3
  1954  	VPTEST Y3, Y3
  1955  	JNZ avx2success
  1956  	VZEROUPPER
  1957  	MOVQ $-1, (R8)
  1958  	RET
  1959  
  1960  avx2success:
  1961  	VPMOVMSKB Y3, DX
  1962  	BSFL DX, DX
  1963  	SUBQ SI, DI
  1964  	ADDQ DI, DX
  1965  	MOVQ DX, (R8)
  1966  	VZEROUPPER
  1967  	RET
  1968  
  1969  TEXT bytes·Equal(SB),NOSPLIT,$0-49
  1970  	MOVQ	a_len+8(FP), BX
  1971  	MOVQ	b_len+32(FP), CX
  1972  	CMPQ	BX, CX
  1973  	JNE	eqret
  1974  	MOVQ	a+0(FP), SI
  1975  	MOVQ	b+24(FP), DI
  1976  	LEAQ	ret+48(FP), AX
  1977  	JMP	runtime·memeqbody(SB)
  1978  eqret:
  1979  	MOVB	$0, ret+48(FP)
  1980  	RET
  1981  
  1982  TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
  1983  	get_tls(CX)
  1984  	MOVQ	g(CX), AX
  1985  	MOVQ	g_m(AX), AX
  1986  	MOVL	m_fastrand(AX), DX
  1987  	ADDL	DX, DX
  1988  	MOVL	DX, BX
  1989  	XORL	$0x88888eef, DX
  1990  	CMOVLMI	BX, DX
  1991  	MOVL	DX, m_fastrand(AX)
  1992  	MOVL	DX, ret+0(FP)
  1993  	RET
  1994  
  1995  TEXT runtime·return0(SB), NOSPLIT, $0
  1996  	MOVL	$0, AX
  1997  	RET
  1998  
  1999  
  2000  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  2001  // Must obey the gcc calling convention.
  2002  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  2003  	get_tls(CX)
  2004  	MOVQ	g(CX), AX
  2005  	MOVQ	g_m(AX), AX
  2006  	MOVQ	m_curg(AX), AX
  2007  	MOVQ	(g_stack+stack_hi)(AX), AX
  2008  	RET
  2009  
  2010  // The top-most function running on a goroutine
  2011  // returns to goexit+PCQuantum.
  2012  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  2013  	BYTE	$0x90	// NOP
  2014  	CALL	runtime·goexit1(SB)	// does not return
  2015  	// traceback from goexit1 must hit code range of goexit
  2016  	BYTE	$0x90	// NOP
  2017  
  2018  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
  2019  	MOVQ	addr+0(FP), AX
  2020  	PREFETCHT0	(AX)
  2021  	RET
  2022  
  2023  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
  2024  	MOVQ	addr+0(FP), AX
  2025  	PREFETCHT1	(AX)
  2026  	RET
  2027  
  2028  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
  2029  	MOVQ	addr+0(FP), AX
  2030  	PREFETCHT2	(AX)
  2031  	RET
  2032  
  2033  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
  2034  	MOVQ	addr+0(FP), AX
  2035  	PREFETCHNTA	(AX)
  2036  	RET
  2037  
  2038  // This is called from .init_array and follows the platform, not Go, ABI.
  2039  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  2040  	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
  2041  	MOVQ	runtime·lastmoduledatap(SB), AX
  2042  	MOVQ	DI, moduledata_next(AX)
  2043  	MOVQ	DI, runtime·lastmoduledatap(SB)
  2044  	POPQ	R15
  2045  	RET