github.com/dannin/go@v0.0.0-20161031215817-d35dfd405eaa/src/runtime/asm_amd64.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
    11  	// copy arguments forward on an even stack
    12  	MOVQ	DI, AX		// argc
    13  	MOVQ	SI, BX		// argv
    14  	SUBQ	$(4*8+7), SP		// 2args 2auto
    15  	ANDQ	$~15, SP	// round SP down to a 16-byte boundary
    16  	MOVQ	AX, 16(SP)
    17  	MOVQ	BX, 24(SP)
    18  	
    19  	// create istack out of the given (operating system) stack.
    20  	// _cgo_init may update stackguard.
    21  	MOVQ	$runtime·g0(SB), DI
    22  	LEAQ	(-64*1024+104)(SP), BX	// g0 gets 64KB of the OS stack
    23  	MOVQ	BX, g_stackguard0(DI)
    24  	MOVQ	BX, g_stackguard1(DI)
    25  	MOVQ	BX, (g_stack+stack_lo)(DI)
    26  	MOVQ	SP, (g_stack+stack_hi)(DI)
    27  
    28  	// find out information about the processor we're on
    29  	MOVQ	$0, AX	// CPUID leaf 0: vendor id and max supported leaf
    30  	CPUID
    31  	MOVQ	AX, SI	// SI = max supported CPUID leaf
    32  	CMPQ	AX, $0
    33  	JE	nocpuinfo
    34  
    35  	// Figure out how to serialize RDTSC.
    36  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    37  	// Don't know about the rest, so let's do MFENCE.
    38  	CMPL	BX, $0x756E6547  // "Genu"
    39  	JNE	notintel
    40  	CMPL	DX, $0x49656E69  // "ineI"
    41  	JNE	notintel
    42  	CMPL	CX, $0x6C65746E  // "ntel"
    43  	JNE	notintel
    44  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
    45  notintel:
    46  
    47  	// Load EAX=1 cpuid flags
    48  	MOVQ	$1, AX
    49  	CPUID
    50  	MOVL	CX, runtime·cpuid_ecx(SB)
    51  	MOVL	DX, runtime·cpuid_edx(SB)
    52  
    53  	// Load EAX=7/ECX=0 cpuid flags
    54  	CMPQ	SI, $7	// is leaf 7 supported?
    55  	JLT	no7
    56  	MOVL	$7, AX
    57  	MOVL	$0, CX
    58  	CPUID
    59  	MOVL	BX, runtime·cpuid_ebx7(SB)
    60  no7:
    61  	// Detect AVX and AVX2 as per 14.7.1  Detection of AVX2 chapter of [1]
    62  	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
    63  	// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
    64  	MOVL	runtime·cpuid_ecx(SB), CX
    65  	ANDL    $0x18000000, CX // check for OSXSAVE and AVX bits
    66  	CMPL    CX, $0x18000000
    67  	JNE     noavx
    68  	MOVL    $0, CX
    69  	// For XGETBV, OSXSAVE bit is required and sufficient
    70  	XGETBV
    71  	ANDL    $6, AX
    72  	CMPL    AX, $6 // Check for OS support of YMM registers
    73  	JNE     noavx
    74  	MOVB    $1, runtime·support_avx(SB)
    75  	TESTL   $(1<<5), runtime·cpuid_ebx7(SB) // check for AVX2 bit
    76  	JEQ     noavx2
    77  	MOVB    $1, runtime·support_avx2(SB)
    78  	JMP     nocpuinfo
    79  noavx:
    80  	MOVB    $0, runtime·support_avx(SB)
    81  noavx2:
    82  	MOVB    $0, runtime·support_avx2(SB)
    83  nocpuinfo:	
    84  	
    85  	// if there is an _cgo_init, call it.
    86  	MOVQ	_cgo_init(SB), AX
    87  	TESTQ	AX, AX
    88  	JZ	needtls
    89  	// g0 already in DI
    90  	MOVQ	DI, CX	// Win64 uses CX for first parameter
    91  	MOVQ	$setg_gcc<>(SB), SI
    92  	CALL	AX
    93  
    94  	// update stackguard after _cgo_init
    95  	MOVQ	$runtime·g0(SB), CX
    96  	MOVQ	(g_stack+stack_lo)(CX), AX
    97  	ADDQ	$const__StackGuard, AX
    98  	MOVQ	AX, g_stackguard0(CX)
    99  	MOVQ	AX, g_stackguard1(CX)
   100  
   101  #ifndef GOOS_windows
   102  	JMP ok
   103  #endif
   104  needtls:
   105  #ifdef GOOS_plan9
   106  	// skip TLS setup on Plan 9
   107  	JMP ok
   108  #endif
   109  #ifdef GOOS_solaris
   110  	// skip TLS setup on Solaris
   111  	JMP ok
   112  #endif
   113  
   114  	LEAQ	runtime·m0+m_tls(SB), DI
   115  	CALL	runtime·settls(SB)
   116  
   117  	// store through it, to make sure it works
   118  	get_tls(BX)
   119  	MOVQ	$0x123, g(BX)	// write a sentinel through the TLS slot
   120  	MOVQ	runtime·m0+m_tls(SB), AX
   121  	CMPQ	AX, $0x123	// read it back directly from m0.tls
   122  	JEQ 2(PC)
   123  	MOVL	AX, 0	// abort
   124  ok:
   125  	// set the per-goroutine and per-mach "registers"
   126  	get_tls(BX)
   127  	LEAQ	runtime·g0(SB), CX
   128  	MOVQ	CX, g(BX)
   129  	LEAQ	runtime·m0(SB), AX
   130  
   131  	// save m->g0 = g0
   132  	MOVQ	CX, m_g0(AX)
   133  	// save m0 to g0->m
   134  	MOVQ	AX, g_m(CX)
   135  
   136  	CLD				// convention is D is always left cleared
   137  	CALL	runtime·check(SB)
   138  
   139  	MOVL	16(SP), AX		// copy argc
   140  	MOVL	AX, 0(SP)
   141  	MOVQ	24(SP), AX		// copy argv
   142  	MOVQ	AX, 8(SP)
   143  	CALL	runtime·args(SB)
   144  	CALL	runtime·osinit(SB)
   145  	CALL	runtime·schedinit(SB)
   146  
   147  	// create a new goroutine to start program
   148  	MOVQ	$runtime·mainPC(SB), AX		// entry
   149  	PUSHQ	AX
   150  	PUSHQ	$0			// arg size
   151  	CALL	runtime·newproc(SB)
   152  	POPQ	AX
   153  	POPQ	AX
   154  
   155  	// start this M
   156  	CALL	runtime·mstart(SB)
   157  
   158  	MOVL	$0xf1, 0xf1  // crash
   159  	RET
   160  
   161  DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)	// mainPC is a function value for runtime·main, passed to newproc in rt0_go
   162  GLOBL	runtime·mainPC(SB),RODATA,$8
   163  
   164  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
   165  	BYTE	$0xcc	// INT3: break into the debugger
   166  	RET
   167  
   168  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   169  	// No per-thread init is needed on amd64.
   170  	RET
   171  
   172  /*
   173   *  go-routine
   174   */
   175  
   176  // void gosave(Gobuf*)
   177  // save state in Gobuf; setjmp
   178  TEXT runtime·gosave(SB), NOSPLIT, $0-8
   179  	MOVQ	buf+0(FP), AX		// gobuf
   180  	LEAQ	buf+0(FP), BX		// caller's SP
   181  	MOVQ	BX, gobuf_sp(AX)
   182  	MOVQ	0(SP), BX		// caller's PC
   183  	MOVQ	BX, gobuf_pc(AX)
   184  	MOVQ	$0, gobuf_ret(AX)
   185  	MOVQ	BP, gobuf_bp(AX)	// save frame pointer
   186  	// Assert ctxt is zero. See func save.
   187  	MOVQ	gobuf_ctxt(AX), BX
   188  	TESTQ	BX, BX
   189  	JZ	2(PC)	// skip the abort when ctxt == 0
   190  	CALL	runtime·badctxt(SB)
   191  	get_tls(CX)
   192  	MOVQ	g(CX), BX
   193  	MOVQ	BX, gobuf_g(AX)
   194  	RET
   195  
   196  // void gogo(Gobuf*)
   197  // restore state from Gobuf; longjmp
   198  TEXT runtime·gogo(SB), NOSPLIT, $16-8
   199  	MOVQ	buf+0(FP), BX		// gobuf
   200  
   201  	// If ctxt is not nil, invoke deletion barrier before overwriting.
   202  	MOVQ	gobuf_ctxt(BX), AX
   203  	TESTQ	AX, AX
   204  	JZ	nilctxt
   205  	LEAQ	gobuf_ctxt(BX), AX
   206  	MOVQ	AX, 0(SP)	// arg 0: address of the ctxt slot
   207  	MOVQ	$0, 8(SP)	// arg 1: new value (nil)
   208  	CALL	runtime·writebarrierptr_prewrite(SB)
   209  	MOVQ	buf+0(FP), BX	// reload gobuf; the call may clobber BX
   210  
   211  nilctxt:
   212  	MOVQ	gobuf_g(BX), DX
   213  	MOVQ	0(DX), CX		// make sure g != nil
   214  	get_tls(CX)
   215  	MOVQ	DX, g(CX)
   216  	MOVQ	gobuf_sp(BX), SP	// restore SP
   217  	MOVQ	gobuf_ret(BX), AX
   218  	MOVQ	gobuf_ctxt(BX), DX
   219  	MOVQ	gobuf_bp(BX), BP
   220  	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   221  	MOVQ	$0, gobuf_ret(BX)
   222  	MOVQ	$0, gobuf_ctxt(BX)
   223  	MOVQ	$0, gobuf_bp(BX)
   224  	MOVQ	gobuf_pc(BX), BX
   225  	JMP	BX	// resume at the saved PC
   226  
   227  // func mcall(fn func(*g))
   228  // Switch to m->g0's stack, call fn(g).
   229  // Fn must never return. It should gogo(&g->sched)
   230  // to keep running g.
   231  TEXT runtime·mcall(SB), NOSPLIT, $0-8
   232  	MOVQ	fn+0(FP), DI
   233  	
   234  	get_tls(CX)
   235  	MOVQ	g(CX), AX	// save state in g->sched
   236  	MOVQ	0(SP), BX	// caller's PC
   237  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   238  	LEAQ	fn+0(FP), BX	// caller's SP
   239  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   240  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   241  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   242  
   243  	// switch to m->g0 & its stack, call fn
   244  	MOVQ	g(CX), BX
   245  	MOVQ	g_m(BX), BX
   246  	MOVQ	m_g0(BX), SI
   247  	CMPQ	SI, AX	// if g == m->g0 call badmcall
   248  	JNE	3(PC)	// skip the badmcall tail-jump below
   249  	MOVQ	$runtime·badmcall(SB), AX
   250  	JMP	AX
   251  	MOVQ	SI, g(CX)	// g = m->g0
   252  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   253  	PUSHQ	AX	// push g as fn's argument
   254  	MOVQ	DI, DX	// DX = funcval (closure context register)
   255  	MOVQ	0(DI), DI
   256  	CALL	DI	// fn(g); fn must never return
   257  	POPQ	AX
   258  	MOVQ	$runtime·badmcall2(SB), AX
   259  	JMP	AX
   260  	RET
   261  
   262  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   263  // of the G stack. We need to distinguish the routine that
   264  // lives at the bottom of the G stack from the one that lives
   265  // at the top of the system stack because the one at the top of
   266  // the system stack terminates the stack walk (see topofstack()).
   267  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   268  	RET	// placeholder; exists only as a distinguishable PC (see comment above)
   269  
   270  // func systemstack(fn func())
   271  TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   272  	MOVQ	fn+0(FP), DI	// DI = fn
   273  	get_tls(CX)
   274  	MOVQ	g(CX), AX	// AX = g
   275  	MOVQ	g_m(AX), BX	// BX = m
   276  
   277  	MOVQ	m_gsignal(BX), DX	// DX = gsignal
   278  	CMPQ	AX, DX
   279  	JEQ	noswitch
   280  
   281  	MOVQ	m_g0(BX), DX	// DX = g0
   282  	CMPQ	AX, DX
   283  	JEQ	noswitch
   284  
   285  	MOVQ	m_curg(BX), R8
   286  	CMPQ	AX, R8
   287  	JEQ	switch
   288  	
   289  	// Bad: g is not gsignal, not g0, not curg. What is it?
   290  	MOVQ	$runtime·badsystemstack(SB), AX
   291  	CALL	AX
   292  
   293  switch:
   294  	// save our state in g->sched. Pretend to
   295  	// be systemstack_switch if the G stack is scanned.
   296  	MOVQ	$runtime·systemstack_switch(SB), SI
   297  	MOVQ	SI, (g_sched+gobuf_pc)(AX)
   298  	MOVQ	SP, (g_sched+gobuf_sp)(AX)
   299  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   300  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   301  
   302  	// switch to g0
   303  	MOVQ	DX, g(CX)
   304  	MOVQ	(g_sched+gobuf_sp)(DX), BX
   305  	// make it look like mstart called systemstack on g0, to stop traceback
   306  	SUBQ	$8, BX
   307  	MOVQ	$runtime·mstart(SB), DX
   308  	MOVQ	DX, 0(BX)	// fake return PC: runtime·mstart
   309  	MOVQ	BX, SP
   310  
   311  	// call target function
   312  	MOVQ	DI, DX	// DX = funcval (closure context register)
   313  	MOVQ	0(DI), DI
   314  	CALL	DI
   315  
   316  	// switch back to g
   317  	get_tls(CX)
   318  	MOVQ	g(CX), AX
   319  	MOVQ	g_m(AX), BX
   320  	MOVQ	m_curg(BX), AX
   321  	MOVQ	AX, g(CX)
   322  	MOVQ	(g_sched+gobuf_sp)(AX), SP
   323  	MOVQ	$0, (g_sched+gobuf_sp)(AX)	// clear saved SP to help GC
   324  	RET
   325  
   326  noswitch:
   327  	// already on m stack, just call directly
   328  	MOVQ	DI, DX	// DX = funcval (closure context register)
   329  	MOVQ	0(DI), DI
   330  	CALL	DI
   331  	RET
   332  
   333  /*
   334   * support for morestack
   335   */
   336  
   337  // Called during function prolog when more stack is needed.
   338  //
   339  // The traceback routines see morestack on a g0 as being
   340  // the top of a stack (for example, morestack calling newstack
   341  // calling the scheduler calling newm calling gc), so we must
   342  // record an argument size. For that purpose, it has no arguments.
   343  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   344  	// Cannot grow scheduler stack (m->g0).
   345  	get_tls(CX)
   346  	MOVQ	g(CX), BX
   347  	MOVQ	g_m(BX), BX
   348  	MOVQ	m_g0(BX), SI
   349  	CMPQ	g(CX), SI
   350  	JNE	3(PC)	// abort only when g == m->g0
   351  	CALL	runtime·badmorestackg0(SB)
   352  	INT	$3
   353  
   354  	// Cannot grow signal stack (m->gsignal).
   355  	MOVQ	m_gsignal(BX), SI
   356  	CMPQ	g(CX), SI
   357  	JNE	3(PC)	// abort only when g == m->gsignal
   358  	CALL	runtime·badmorestackgsignal(SB)
   359  	INT	$3
   360  
   361  	// Called from f.
   362  	// Set m->morebuf to f's caller.
   363  	MOVQ	8(SP), AX	// f's caller's PC
   364  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   365  	LEAQ	16(SP), AX	// f's caller's SP
   366  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   367  	get_tls(CX)
   368  	MOVQ	g(CX), SI
   369  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   370  
   371  	// Set g->sched to context in f.
   372  	MOVQ	0(SP), AX // f's PC
   373  	MOVQ	AX, (g_sched+gobuf_pc)(SI)
   374  	MOVQ	SI, (g_sched+gobuf_g)(SI)
   375  	LEAQ	8(SP), AX // f's SP
   376  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   377  	MOVQ	BP, (g_sched+gobuf_bp)(SI)
   378  	// newstack will fill gobuf.ctxt.
   379  
   380  	// Call newstack on m->g0's stack.
   381  	MOVQ	m_g0(BX), BX
   382  	MOVQ	BX, g(CX)
   383  	MOVQ	(g_sched+gobuf_sp)(BX), SP
   384  	PUSHQ	DX	// ctxt argument
   385  	CALL	runtime·newstack(SB)
   386  	MOVQ	$0, 0x1003	// crash if newstack returns
   387  	POPQ	DX	// keep balance check happy
   388  	RET
   389  
   390  // morestack but not preserving ctxt.
   391  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   392  	MOVL	$0, DX	// clear the ctxt register before the shared path
   393  	JMP	runtime·morestack(SB)
   394  
   395  TEXT runtime·stackBarrier(SB),NOSPLIT,$0
   396  	// We came here via a RET to an overwritten return PC.
   397  	// AX may be live. Other registers are available.
   398  
   399  	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   400  	get_tls(CX)
   401  	MOVQ	g(CX), CX
   402  	MOVQ	(g_stkbar+slice_array)(CX), DX
   403  	MOVQ	g_stkbarPos(CX), BX
   404  	IMULQ	$stkbar__size, BX	// Too big for SIB.
   405  	MOVQ	stkbar_savedLRPtr(DX)(BX*1), R8
   406  	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
   407  	// Assert that we're popping the right saved LR.
   408  	ADDQ	$8, R8
   409  	CMPQ	R8, SP
   410  	JEQ	2(PC)
   411  	MOVL	$0, 0	// crash: popped the wrong stack barrier
   412  	// Record that this stack barrier was hit.
   413  	ADDQ	$1, g_stkbarPos(CX)
   414  	// Jump to the original return PC.
   415  	JMP	BX
   416  
   417  // reflectcall: call a function with the given argument list
   418  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   419  // we don't have variable-sized frames, so we use a small number
   420  // of constant-sized-frame functions to encode a few bits of size in the pc.
   421  // Caution: ugly multiline assembly macros in your future!
   422  
// DISPATCH tail-jumps to NAME when the argument size in CX is <= MAXSIZE;
// otherwise execution falls through to the next DISPATCH entry.
   423  #define DISPATCH(NAME,MAXSIZE)		\
   424  	CMPQ	CX, $MAXSIZE;		\
   425  	JA	3(PC);			\
   426  	MOVQ	$NAME(SB), AX;		\
   427  	JMP	AX
   428  // Note: can't just "JMP NAME(SB)" - bad inlining results.
   429  
   430  TEXT reflect·call(SB), NOSPLIT, $0-0
   431  	JMP	·reflectcall(SB)	// reflect.call is an alias for runtime.reflectcall
   432  
   433  TEXT ·reflectcall(SB), NOSPLIT, $0-32
   434  	MOVLQZX argsize+24(FP), CX	// CX = argument size, zero-extended to 64 bits
   435  	DISPATCH(runtime·call32, 32)
   436  	DISPATCH(runtime·call64, 64)
   437  	DISPATCH(runtime·call128, 128)
   438  	DISPATCH(runtime·call256, 256)
   439  	DISPATCH(runtime·call512, 512)
   440  	DISPATCH(runtime·call1024, 1024)
   441  	DISPATCH(runtime·call2048, 2048)
   442  	DISPATCH(runtime·call4096, 4096)
   443  	DISPATCH(runtime·call8192, 8192)
   444  	DISPATCH(runtime·call16384, 16384)
   445  	DISPATCH(runtime·call32768, 32768)
   446  	DISPATCH(runtime·call65536, 65536)
   447  	DISPATCH(runtime·call131072, 131072)
   448  	DISPATCH(runtime·call262144, 262144)
   449  	DISPATCH(runtime·call524288, 524288)
   450  	DISPATCH(runtime·call1048576, 1048576)
   451  	DISPATCH(runtime·call2097152, 2097152)
   452  	DISPATCH(runtime·call4194304, 4194304)
   453  	DISPATCH(runtime·call8388608, 8388608)
   454  	DISPATCH(runtime·call16777216, 16777216)
   455  	DISPATCH(runtime·call33554432, 33554432)
   456  	DISPATCH(runtime·call67108864, 67108864)
   457  	DISPATCH(runtime·call134217728, 134217728)
   458  	DISPATCH(runtime·call268435456, 268435456)
   459  	DISPATCH(runtime·call536870912, 536870912)
   460  	DISPATCH(runtime·call1073741824, 1073741824)
   461  	MOVQ	$runtime·badreflectcall(SB), AX	// argsize larger than any call stub
   462  	JMP	AX
   463  
// CALLFN defines one runtime·call<MAXSIZE> stub used by reflectcall: it
// copies the caller's argument frame onto its own stack, calls f, and then
// copies the results back through callRet.
   464  #define CALLFN(NAME,MAXSIZE)			\
   465  TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   466  	NO_LOCAL_POINTERS;			\
   467  	/* copy arguments to stack */		\
   468  	MOVQ	argptr+16(FP), SI;		\
   469  	MOVLQZX argsize+24(FP), CX;		\
   470  	MOVQ	SP, DI;				\
   471  	REP;MOVSB;				\
   472  	/* call function */			\
   473  	MOVQ	f+8(FP), DX;			\
   474  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   475  	CALL	(DX);				\
   476  	/* copy return values back */		\
   477  	MOVQ	argtype+0(FP), DX;		\
   478  	MOVQ	argptr+16(FP), DI;		\
   479  	MOVLQZX	argsize+24(FP), CX;		\
   480  	MOVLQZX	retoffset+28(FP), BX;		\
   481  	MOVQ	SP, SI;				\
   482  	ADDQ	BX, DI;				\
   483  	ADDQ	BX, SI;				\
   484  	SUBQ	BX, CX;				\
   485  	CALL	callRet<>(SB);			\
   486  	RET
   487  
   488  // callRet copies return values back at the end of call*. This is a
   489  // separate function so it can allocate stack space for the arguments
   490  // to reflectcallmove. It does not follow the Go ABI; it expects its
   491  // arguments in registers.
   492  TEXT callRet<>(SB), NOSPLIT, $32-0
   493  	NO_LOCAL_POINTERS
   494  	MOVQ	DX, 0(SP)	// argtype
   495  	MOVQ	DI, 8(SP)	// destination: caller's result area
   496  	MOVQ	SI, 16(SP)	// source: results on the call* stack
   497  	MOVQ	CX, 24(SP)	// size in bytes
   498  	CALL	runtime·reflectcallmove(SB)
   499  	RET
   500  
// Instantiate one call stub per power-of-two frame size dispatched above.
   501  CALLFN(·call32, 32)
   502  CALLFN(·call64, 64)
   503  CALLFN(·call128, 128)
   504  CALLFN(·call256, 256)
   505  CALLFN(·call512, 512)
   506  CALLFN(·call1024, 1024)
   507  CALLFN(·call2048, 2048)
   508  CALLFN(·call4096, 4096)
   509  CALLFN(·call8192, 8192)
   510  CALLFN(·call16384, 16384)
   511  CALLFN(·call32768, 32768)
   512  CALLFN(·call65536, 65536)
   513  CALLFN(·call131072, 131072)
   514  CALLFN(·call262144, 262144)
   515  CALLFN(·call524288, 524288)
   516  CALLFN(·call1048576, 1048576)
   517  CALLFN(·call2097152, 2097152)
   518  CALLFN(·call4194304, 4194304)
   519  CALLFN(·call8388608, 8388608)
   520  CALLFN(·call16777216, 16777216)
   521  CALLFN(·call33554432, 33554432)
   522  CALLFN(·call67108864, 67108864)
   523  CALLFN(·call134217728, 134217728)
   524  CALLFN(·call268435456, 268435456)
   525  CALLFN(·call536870912, 536870912)
   526  CALLFN(·call1073741824, 1073741824)
   527  
   528  TEXT runtime·procyield(SB),NOSPLIT,$0-0
   529  	MOVL	cycles+0(FP), AX
   530  again:
   531  	PAUSE	// spin-wait hint to the CPU
   532  	SUBL	$1, AX
   533  	JNZ	again	// loop 'cycles' times
   534  	RET
   535  
   536  
   537  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   538  	// Stores are already ordered on x86, so this is just a
   539  	// compile barrier.
	// No fence instruction is needed.
   540  	RET
   541  
   542  // void jmpdefer(fn, sp);
   543  // called from deferreturn.
   544  // 1. pop the caller
   545  // 2. sub 5 bytes from the callers return
   546  // 3. jmp to the argument
   547  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   548  	MOVQ	fv+0(FP), DX	// fn
   549  	MOVQ	argp+8(FP), BX	// caller sp
   550  	LEAQ	-8(BX), SP	// caller sp after CALL
   551  	MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
   552  	SUBQ	$5, (SP)	// return to CALL again (a CALL instruction is 5 bytes)
   553  	MOVQ	0(DX), BX
   554  	JMP	BX	// but first run the deferred function
   555  
   556  // Save state of caller into g->sched. Smashes R8, R9.
   557  TEXT gosave<>(SB),NOSPLIT,$0
   558  	get_tls(R8)
   559  	MOVQ	g(R8), R8
   560  	MOVQ	0(SP), R9	// caller's PC
   561  	MOVQ	R9, (g_sched+gobuf_pc)(R8)
   562  	LEAQ	8(SP), R9	// caller's SP (skip our return address)
   563  	MOVQ	R9, (g_sched+gobuf_sp)(R8)
   564  	MOVQ	$0, (g_sched+gobuf_ret)(R8)
   565  	MOVQ	BP, (g_sched+gobuf_bp)(R8)
   566  	// Assert ctxt is zero. See func save.
   567  	MOVQ	(g_sched+gobuf_ctxt)(R8), R9
   568  	TESTQ	R9, R9
   569  	JZ	2(PC)	// skip the abort when ctxt == 0
   570  	CALL	runtime·badctxt(SB)
   571  	RET
   572  
   573  // func asmcgocall(fn, arg unsafe.Pointer) int32
   574  // Call fn(arg) on the scheduler stack,
   575  // aligned appropriately for the gcc ABI.
   576  // See cgocall.go for more details.
   577  TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   578  	MOVQ	fn+0(FP), AX
   579  	MOVQ	arg+8(FP), BX
   580  
   581  	MOVQ	SP, DX	// remember the caller's stack pointer
   582  
   583  	// Figure out if we need to switch to m->g0 stack.
   584  	// We get called to create new OS threads too, and those
   585  	// come in on the m->g0 stack already.
   586  	get_tls(CX)
   587  	MOVQ	g(CX), R8
   588  	CMPQ	R8, $0
   589  	JEQ	nosave	// no g at all
   590  	MOVQ	g_m(R8), R8
   591  	MOVQ	m_g0(R8), SI
   592  	MOVQ	g(CX), DI
   593  	CMPQ	SI, DI
   594  	JEQ	nosave	// already on g0
   595  	MOVQ	m_gsignal(R8), SI
   596  	CMPQ	SI, DI
   597  	JEQ	nosave	// already on the signal stack
   598  	
   599  	// Switch to system stack.
   600  	MOVQ	m_g0(R8), SI
   601  	CALL	gosave<>(SB)	// record current state in g->sched
   602  	MOVQ	SI, g(CX)
   603  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   604  
   605  	// Now on a scheduling stack (a pthread-created stack).
   606  	// Make sure we have enough room for 4 stack-backed fast-call
   607  	// registers as per windows amd64 calling convention.
   608  	SUBQ	$64, SP
   609  	ANDQ	$~15, SP	// alignment for gcc ABI
   610  	MOVQ	DI, 48(SP)	// save g
   611  	MOVQ	(g_stack+stack_hi)(DI), DI
   612  	SUBQ	DX, DI
   613  	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   614  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   615  	MOVQ	BX, CX		// CX = first argument in Win64
   616  	CALL	AX
   617  
   618  	// Restore registers, g, stack pointer.
   619  	get_tls(CX)
   620  	MOVQ	48(SP), DI	// reload g
   621  	MOVQ	(g_stack+stack_hi)(DI), SI
   622  	SUBQ	40(SP), SI	// recompute SP from the saved depth
   623  	MOVQ	DI, g(CX)
   624  	MOVQ	SI, SP
   625  
   626  	MOVL	AX, ret+16(FP)	// C function's int32 result
   627  	RET
   628  
   629  nosave:
   630  	// Running on a system stack, perhaps even without a g.
   631  	// Having no g can happen during thread creation or thread teardown
   632  	// (see needm/dropm on Solaris, for example).
   633  	// This code is like the above sequence but without saving/restoring g
   634  	// and without worrying about the stack moving out from under us
   635  	// (because we're on a system stack, not a goroutine stack).
   636  	// The above code could be used directly if already on a system stack,
   637  	// but then the only path through this code would be a rare case on Solaris.
   638  	// Using this code for all "already on system stack" calls exercises it more,
   639  	// which should help keep it correct.
   640  	SUBQ	$64, SP
   641  	ANDQ	$~15, SP	// alignment for gcc ABI
   642  	MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
   643  	MOVQ	DX, 40(SP)	// save original stack pointer
   644  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   645  	MOVQ	BX, CX		// CX = first argument in Win64
   646  	CALL	AX
   647  	MOVQ	40(SP), SI	// restore original stack pointer
   648  	MOVQ	SI, SP
   649  	MOVL	AX, ret+16(FP)	// C function's int32 result
   650  	RET
   651  
   652  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
   653  // Turn the fn into a Go func (by taking its address) and call
   654  // cgocallback_gofunc.
   655  TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
   656  	LEAQ	fn+0(FP), AX	// address of fn serves as the Go func value
   657  	MOVQ	AX, 0(SP)
   658  	MOVQ	frame+8(FP), AX
   659  	MOVQ	AX, 8(SP)
   660  	MOVQ	framesize+16(FP), AX
   661  	MOVQ	AX, 16(SP)
   662  	MOVQ	ctxt+24(FP), AX
   663  	MOVQ	AX, 24(SP)
   664  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   665  	CALL	AX
   666  	RET
   667  
   668  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
   669  // See cgocall.go for more details.
   670  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
   671  	NO_LOCAL_POINTERS
   672  
   673  	// If g is nil, Go did not create the current thread.
   674  	// Call needm to obtain one m for temporary use.
   675  	// In this case, we're running on the thread stack, so there's
   676  	// lots of space, but the linker doesn't know. Hide the call from
   677  	// the linker analysis by using an indirect call through AX.
   678  	get_tls(CX)
   679  #ifdef GOOS_windows
   680  	MOVL	$0, BX
   681  	CMPQ	CX, $0
   682  	JEQ	2(PC)	// TLS not yet set up: keep BX = nil g
   683  #endif
   684  	MOVQ	g(CX), BX
   685  	CMPQ	BX, $0
   686  	JEQ	needm
   687  	MOVQ	g_m(BX), BX
   688  	MOVQ	BX, R8 // holds oldm until end of function
   689  	JMP	havem
   690  needm:
   691  	MOVQ	$0, 0(SP)
   692  	MOVQ	$runtime·needm(SB), AX
   693  	CALL	AX
   694  	MOVQ	0(SP), R8	// R8 = oldm; nil here means the m was borrowed
   695  	get_tls(CX)
   696  	MOVQ	g(CX), BX
   697  	MOVQ	g_m(BX), BX
   698  	
   699  	// Set m->sched.sp = SP, so that if a panic happens
   700  	// during the function we are about to execute, it will
   701  	// have a valid SP to run on the g0 stack.
   702  	// The next few lines (after the havem label)
   703  	// will save this SP onto the stack and then write
   704  	// the same SP back to m->sched.sp. That seems redundant,
   705  	// but if an unrecovered panic happens, unwindm will
   706  	// restore the g->sched.sp from the stack location
   707  	// and then systemstack will try to use it. If we don't set it here,
   708  	// that restored SP will be uninitialized (typically 0) and
   709  	// will not be usable.
   710  	MOVQ	m_g0(BX), SI
   711  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   712  
   713  havem:
   714  	// Now there's a valid m, and we're running on its m->g0.
   715  	// Save current m->g0->sched.sp on stack and then set it to SP.
   716  	// Save current sp in m->g0->sched.sp in preparation for
   717  	// switch back to m->curg stack.
   718  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   719  	MOVQ	m_g0(BX), SI
   720  	MOVQ	(g_sched+gobuf_sp)(SI), AX
   721  	MOVQ	AX, 0(SP)
   722  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   723  
   724  	// Switch to m->curg stack and call runtime.cgocallbackg.
   725  	// Because we are taking over the execution of m->curg
   726  	// but *not* resuming what had been running, we need to
   727  	// save that information (m->curg->sched) so we can restore it.
   728  	// We can restore m->curg->sched.sp easily, because calling
   729  	// runtime.cgocallbackg leaves SP unchanged upon return.
   730  	// To save m->curg->sched.pc, we push it onto the stack.
   731  	// This has the added benefit that it looks to the traceback
   732  	// routine like cgocallbackg is going to return to that
   733  	// PC (because the frame we allocate below has the same
   734  	// size as cgocallback_gofunc's frame declared above)
   735  	// so that the traceback will seamlessly trace back into
   736  	// the earlier calls.
   737  	//
   738  	// In the new goroutine, 8(SP) holds the saved R8.
   739  	MOVQ	m_curg(BX), SI
   740  	MOVQ	SI, g(CX)
   741  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   742  	MOVQ	(g_sched+gobuf_pc)(SI), BX
   743  	MOVQ	BX, -8(DI)	// stash curg's saved PC just below its saved SP
   744  	// Compute the size of the frame, including return PC and, if
   745  	// GOEXPERIMENT=framepointer, the saved based pointer
   746  	MOVQ	ctxt+24(FP), BX
   747  	LEAQ	fv+0(FP), AX
   748  	SUBQ	SP, AX	// AX = our frame size
   749  	SUBQ	AX, DI
   750  	MOVQ	DI, SP
   751  
   752  	MOVQ	R8, 8(SP)
   753  	MOVQ	BX, 0(SP)	// ctxt argument for cgocallbackg
   754  	CALL	runtime·cgocallbackg(SB)
   755  	MOVQ	8(SP), R8	// reload oldm
   756  
   757  	// Compute the size of the frame again. FP and SP have
   758  	// completely different values here than they did above,
   759  	// but only their difference matters.
   760  	LEAQ	fv+0(FP), AX
   761  	SUBQ	SP, AX
   762  
   763  	// Restore g->sched (== m->curg->sched) from saved values.
   764  	get_tls(CX)
   765  	MOVQ	g(CX), SI
   766  	MOVQ	SP, DI
   767  	ADDQ	AX, DI	// DI = curg's original SP
   768  	MOVQ	-8(DI), BX	// recover the PC stashed earlier
   769  	MOVQ	BX, (g_sched+gobuf_pc)(SI)
   770  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   771  
   772  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   773  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   774  	// so we do not have to restore it.)
   775  	MOVQ	g(CX), BX
   776  	MOVQ	g_m(BX), BX
   777  	MOVQ	m_g0(BX), SI
   778  	MOVQ	SI, g(CX)
   779  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   780  	MOVQ	0(SP), AX	// the g0 sched.sp saved at havem
   781  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   782  	
   783  	// If the m on entry was nil, we called needm above to borrow an m
   784  	// for the duration of the call. Since the call is over, return it with dropm.
   785  	CMPQ	R8, $0
   786  	JNE 3(PC)	// had an m on entry: skip dropm
   787  	MOVQ	$runtime·dropm(SB), AX
   788  	CALL	AX
   789  
   790  	// Done!
   791  	RET
   792  
   793  // void setg(G*); set g. for use by needm.
   794  TEXT runtime·setg(SB), NOSPLIT, $0-8
   795  	MOVQ	gg+0(FP), BX
   796  #ifdef GOOS_windows
   797  	CMPQ	BX, $0
   798  	JNE	settls
   799  	MOVQ	$0, 0x28(GS)	// clear the Windows TLS slot when g is nil
   800  	RET
   801  settls:
   802  	MOVQ	g_m(BX), AX
   803  	LEAQ	m_tls(AX), AX
   804  	MOVQ	AX, 0x28(GS)	// point the Windows TLS slot at m.tls
   805  #endif
   806  	get_tls(CX)
   807  	MOVQ	BX, g(CX)
   808  	RET
   809  
   810  // void setg_gcc(G*); set g called from gcc.
   811  TEXT setg_gcc<>(SB),NOSPLIT,$0
   812  	get_tls(AX)
   813  	MOVQ	DI, g(AX)	// DI = new g, passed per the C calling convention
   814  	RET
   815  
   816  // check that SP is in range [g->stack.lo, g->stack.hi)
   817  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   818  	get_tls(CX)
   819  	MOVQ	g(CX), AX
   820  	CMPQ	(g_stack+stack_hi)(AX), SP
   821  	JHI	2(PC)	// ok: SP below stack.hi
   822  	INT	$3
   823  	CMPQ	SP, (g_stack+stack_lo)(AX)
   824  	JHI	2(PC)	// ok: SP above stack.lo
   825  	INT	$3
   826  	RET
   827  
   828  TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
   829  	MOVQ	argp+0(FP),AX		// addr of first arg
   830  	MOVQ	-8(AX),AX		// get calling pc
   831  	CMPQ	AX, runtime·stackBarrierPC(SB)
   832  	JNE	nobar
   833  	// Get original return PC.
   834  	CALL	runtime·nextBarrierPC(SB)
   835  	MOVQ	0(SP), AX	// result of nextBarrierPC
   836  nobar:
   837  	MOVQ	AX, ret+8(FP)
   838  	RET
   839  
   840  TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
   841  	MOVQ	argp+0(FP),AX		// addr of first arg
   842  	MOVQ	pc+8(FP), BX
   843  	MOVQ	-8(AX), CX	// current return PC
   844  	CMPQ	CX, runtime·stackBarrierPC(SB)
   845  	JEQ	setbar
   846  	MOVQ	BX, -8(AX)		// set calling pc
   847  	RET
   848  setbar:
   849  	// Set the stack barrier return PC.
   850  	MOVQ	BX, 0(SP)	// argument to setNextBarrierPC
   851  	CALL	runtime·setNextBarrierPC(SB)
   852  	RET
   853  
   854  // func cputicks() int64
   855  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   856  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   857  	JNE	mfence
   858  	LFENCE	// Intel: LFENCE is enough to serialize RDTSC (see rt0_go)
   859  	JMP	done
   860  mfence:
   861  	MFENCE	// others (e.g. AMD): use MFENCE
   862  done:
   863  	RDTSC	// EDX:EAX = time-stamp counter
   864  	SHLQ	$32, DX
   865  	ADDQ	DX, AX	// combine into a 64-bit value in AX
   866  	MOVQ	AX, ret+0(FP)
   867  	RET
   868  
   869  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   870  // redirects to memhash(p, h, size) using the size
   871  // stored in the closure.
   872  TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
   873  	GO_ARGS
   874  	NO_LOCAL_POINTERS
   875  	MOVQ	p+0(FP), AX
   876  	MOVQ	h+8(FP), BX
   877  	MOVQ	8(DX), CX	// size, read from the closure (DX = funcval)
   878  	MOVQ	AX, 0(SP)
   879  	MOVQ	BX, 8(SP)
   880  	MOVQ	CX, 16(SP)
   881  	CALL	runtime·memhash(SB)
   882  	MOVQ	24(SP), AX	// memhash's result
   883  	MOVQ	AX, ret+16(FP)
   884  	RET
   885  
   886  // hash function using AES hardware instructions
   887  TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   888  	MOVQ	p+0(FP), AX	// ptr to data
   889  	MOVQ	s+16(FP), CX	// size
   890  	LEAQ	ret+24(FP), DX	// address where aeshashbody stores the result
   891  	JMP	runtime·aeshashbody(SB)
   892  
   893  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
   894  	MOVQ	p+0(FP), AX	// ptr to string struct
   895  	MOVQ	8(AX), CX	// length of string
   896  	MOVQ	(AX), AX	// string data
   897  	LEAQ	ret+16(FP), DX	// address where aeshashbody stores the result
   898  	JMP	runtime·aeshashbody(SB)
   899  
// AX: data
// CX: length
// DX: address to put return value
// The seed is read from the caller's frame at h+8(FP): aeshash and
// aeshashstr tail-jump here without pushing a frame of their own.
// Dispatches on length to a size-specialized path; every path mixes
// the data with per-process key material (runtime·aeskeysched) and
// runs at least 3 AESENC rounds before storing the 64-bit result.
TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
	// Fill an SSE register with our seeds.
	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
	PINSRW	$4, CX, X0			// 16 bits of length
	PSHUFHW $0, X0, X0			// repeat length 4 times total
	MOVO	X0, X1				// save unscrambled seed
	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
	AESENC	X0, X0				// scramble seed

	// Dispatch on length (unsigned compares).
	CMPQ	CX, $16
	JB	aes0to15
	JE	aes16
	CMPQ	CX, $32
	JBE	aes17to32
	CMPQ	CX, $64
	JBE	aes33to64
	CMPQ	CX, $128
	JBE	aes65to128
	JMP	aes129plus

aes0to15:
	TESTQ	CX, CX
	JE	aes0

	// Check whether a 16-byte load at the data address could cross
	// a page boundary: if (AX+16) & 0xff0 == 0 the address ends in
	// 0xff0..0xfff and the wide load might touch the next page.
	ADDQ	$16, AX
	TESTW	$0xff0, AX
	JE	endofpage

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X1
	ADDQ	CX, CX			// CX *= 2: table entries are 16 bytes, indexed via (AX)(CX*8)
	MOVQ	$masks<>(SB), AX
	PAND	(AX)(CX*8), X1		// keep only the low CX/2 bytes of the data
final1:
	PXOR	X0, X1	// xor data with seed
	AESENC	X1, X1	// scramble combo 3 times
	AESENC	X1, X1
	AESENC	X1, X1
	MOVQ	X1, (DX)
	RET

endofpage:
	// address ends in 1111xxxx. Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	MOVOU	-32(AX)(CX*1), X1	// load the 16 bytes ending at the last data byte
	ADDQ	CX, CX			// CX *= 2: shifts<> entries are 16 bytes apart
	MOVQ	$shifts<>(SB), AX
	PSHUFB	(AX)(CX*8), X1		// move the CX/2 valid bytes down to the bottom
	JMP	final1

aes0:
	// Return scrambled input seed
	AESENC	X0, X0
	MOVQ	X0, (DX)
	RET

aes16:
	MOVOU	(AX), X1
	JMP	final1

aes17to32:
	// make second starting seed
	PXOR	runtime·aeskeysched+16(SB), X1
	AESENC	X1, X1

	// load data to be hashed; the two 16-byte loads may overlap
	// when 17 <= length < 32, which is fine for hashing.
	MOVOU	(AX), X2
	MOVOU	-16(AX)(CX*1), X3

	// xor with seed
	PXOR	X0, X2
	PXOR	X1, X3

	// scramble 3 times
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3

	// combine results
	PXOR	X3, X2
	MOVQ	X2, (DX)
	RET

aes33to64:
	// make 3 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3

	// first 32 bytes and last 32 bytes (possibly overlapping)
	MOVOU	(AX), X4
	MOVOU	16(AX), X5
	MOVOU	-32(AX)(CX*1), X6
	MOVOU	-16(AX)(CX*1), X7

	// xor with seeds
	PXOR	X0, X4
	PXOR	X1, X5
	PXOR	X2, X6
	PXOR	X3, X7

	// scramble 3 times
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// combine results
	PXOR	X6, X4
	PXOR	X7, X5
	PXOR	X5, X4
	MOVQ	X4, (DX)
	RET

aes65to128:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// load data: first 64 bytes and last 64 bytes (possibly overlapping)
	MOVOU	(AX), X8
	MOVOU	16(AX), X9
	MOVOU	32(AX), X10
	MOVOU	48(AX), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor with seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// scramble 3 times
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

aes129plus:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// start with last (possibly overlapping) block
	MOVOU	-128(AX)(CX*1), X8
	MOVOU	-112(AX)(CX*1), X9
	MOVOU	-96(AX)(CX*1), X10
	MOVOU	-80(AX)(CX*1), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor in seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// compute number of remaining 128-byte blocks
	DECQ	CX
	SHRQ	$7, CX

aesloop:
	// scramble state
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// scramble state, xor in a block
	// (each 16-byte data chunk is used as the AESENC round key)
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	MOVOU	64(AX), X4
	MOVOU	80(AX), X5
	MOVOU	96(AX), X6
	MOVOU	112(AX), X7
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	ADDQ	$128, AX
	DECQ	CX
	JNE	aesloop

	// 3 more scrambles to finish
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET
  1230  	
// func aeshash32(p unsafe.Pointer, h uintptr) uintptr
// AES hash of a 4-byte value: inserts the data into the seed
// register and runs 3 AESENC rounds with the per-process key.
TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// data (placed in dword 2, above the seed)
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET
  1240  
// func aeshash64(p unsafe.Pointer, h uintptr) uintptr
// AES hash of an 8-byte value: inserts the data into the seed
// register and runs 3 AESENC rounds with the per-process key.
TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data (placed in qword 1, above the seed)
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET
  1250  
// simple mask to get rid of data in the high part of the register.
// masks<>+16*i is a 16-byte mask whose low i bytes are 0xff and whose
// remaining bytes are zero (i = 0..15). aeshashbody indexes it with
// the length doubled — (AX)(CX*8) with CX = 2*len — because each
// 16-byte entry is written as two 8-byte DATA words.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256
  1285  
// func checkASM() bool
// Sanity check run at startup: verifies assembly-level invariants.
TEXT ·checkASM(SB),NOSPLIT,$0-1
	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
	// (required because they are read with 16-byte PAND/PSHUFB loads).
	MOVQ	$masks<>(SB), AX
	MOVQ	$shifts<>(SB), BX
	ORQ	BX, AX
	TESTQ	$15, AX
	SETEQ	ret+0(FP)	// true iff both addresses have all low 4 bits clear
	RET
  1294  
// these are arguments to pshufb. They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
// shifts<>+16*i is a 16-byte PSHUFB control; 0xff control bytes zero
// the corresponding destination byte. As with masks<>, aeshashbody
// indexes this table with the length doubled ((AX)(CX*8), CX = 2*len)
// since each entry spans two 8-byte DATA words.
DATA shifts<>+0x00(SB)/8, $0x0000000000000000
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256
  1331  
// memequal(a, b unsafe.Pointer, size uintptr) bool
// Reports whether the size bytes at a and b are identical.
// Fast path: identical pointers compare equal without reading memory.
TEXT runtime·memequal(SB),NOSPLIT,$0-25
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	size+16(FP), BX
	LEAQ	ret+24(FP), AX		// memeqbody writes the bool result via AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+24(FP)
	RET
  1344  
// memequal_varlen(a, b unsafe.Pointer) bool
// Closure-called variant of memequal: the element size is not an
// argument but is read from the closure, whose pointer is in DX.
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
	LEAQ	ret+16(FP), AX		// memeqbody writes the bool result via AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+16(FP)
	RET
  1357  
// eqstring tests whether two strings are equal.
// The compiler guarantees that strings passed
// to eqstring have equal length.
// See runtime_test.go:eqstring_generic for
// equivalent Go code.
// Delegates to memeqbody with SI/DI = bases, BX = length,
// AX = address of the bool result.
TEXT runtime·eqstring(SB),NOSPLIT,$0-33
	MOVQ	s1_base+0(FP), SI
	MOVQ	s2_base+16(FP), DI
	CMPQ	SI, DI
	JEQ	eq		// same backing array => equal (lengths already match)
	MOVQ	s1_len+8(FP), BX
	LEAQ	ret+32(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+32(FP)
	RET
  1374  
// a in SI
// b in DI
// count in BX
// address of result byte in AX
// Writes 1 to (AX) if the count bytes match, 0 otherwise.
// Strategy: 64 bytes/iteration with AVX2 (if supported) or SSE,
// then 8 bytes at a time, then an overlapping tail load; counts
// below 8 use page-boundary-safe partial loads.
TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small
	CMPQ	BX, $64
	JB	bigloop
	CMPB    runtime·support_avx2(SB), $1
	JE	hugeloop_avx2

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0		// AND the per-byte equality masks together
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff	// all 16 mask bits set => all 64 bytes equal
	JEQ	hugeloop
	MOVB	$0, (AX)
	RET

	// 64 bytes at a time using ymm registers
hugeloop_avx2:
	CMPQ	BX, $64
	JB	bigloop_avx2
	VMOVDQU	(SI), Y0
	VMOVDQU	(DI), Y1
	VMOVDQU	32(SI), Y2
	VMOVDQU	32(DI), Y3
	VPCMPEQB	Y1, Y0, Y4
	VPCMPEQB	Y2, Y3, Y5
	VPAND	Y4, Y5, Y6
	VPMOVMSKB Y6, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffffffff	// all 32 mask bits set => all 64 bytes equal
	JEQ	hugeloop_avx2
	VZEROUPPER		// leave AVX state before returning to SSE/scalar code
	MOVB	$0, (AX)
	RET

bigloop_avx2:
	VZEROUPPER

	// 8 bytes at a time using 64-bit register
bigloop:
	CMPQ	BX, $8
	JBE	leftover	// <= 8 bytes left: finish with one overlapping load
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	bigloop
	MOVB	$0, (AX)
	RET

	// remaining 0-8 bytes
	// (loads the last 8 bytes, overlapping already-checked data)
leftover:
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	(AX)
	RET

small:
	CMPQ	BX, $0
	JEQ	equal		// zero-length inputs are equal (ZF set by CMPQ)

	LEAQ	0(BX*8), CX	// CX = bits of data we care about
	NEGQ	CX		// CX = shift count (64 - bits, mod 64)

	CMPB	SI, $0xf8
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
di_finish:

	// compare the BX relevant low bytes: subtract, then shift out
	// the don't-care high bytes; SHLQ sets ZF for SETEQ below.
	SUBQ	SI, DI
	SHLQ	CX, DI
equal:
	SETEQ	(AX)
	RET
  1495  
// func cmpstring(s1, s2 string) int
// Three-way string compare; sets up cmpbody's register contract
// (SI/BX = s1 base/len, DI/DX = s2 base/len, R9 = &result).
TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
	MOVQ	s1_base+0(FP), SI
	MOVQ	s1_len+8(FP), BX
	MOVQ	s2_base+16(FP), DI
	MOVQ	s2_len+24(FP), DX
	LEAQ	ret+32(FP), R9
	JMP	runtime·cmpbody(SB)
  1503  
// func Compare(s1, s2 []byte) int  (assembly implementation for package bytes)
// Loads the slice bases and lengths (capacities are ignored) and
// tail-jumps into the shared cmpbody routine.
TEXT bytes·Compare(SB),NOSPLIT,$0-56
	MOVQ	s1+0(FP), SI
	MOVQ	s1+8(FP), BX
	MOVQ	s2+24(FP), DI
	MOVQ	s2+32(FP), DX
	LEAQ	res+48(FP), R9
	JMP	runtime·cmpbody(SB)
  1511  
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
// Compares min(alen, blen) bytes; on the first differing byte the
// result is the unsigned byte order, otherwise the shorter input
// sorts first. Uses AVX2 (64 B/iter), SSE (16 B/iter), 8-byte words,
// then page-boundary-safe partial loads for <8 bytes.
TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame		// same pointer: result decided by lengths alone
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	CMPB    runtime·support_avx2(SB), $1
	JEQ     big_loop_avx2
	JMP	big_loop
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// The diff64/diff48/diff32 entries adjust SI/DI so that the
	// difference found by big_loop lies in the 16 bytes at (SI)/(DI),
	// then fall into diff16.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX	// 1 if a's byte > b's byte (unsigned)
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// overlapping load of the last 8 bytes
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
	JEQ	allsame

	// load bytes of a into high bytes of AX
	CMPB	SI, $0xf8
	JA	si_high		// load would cross a page boundary
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI		// discard bytes beyond the R8 we care about

	// load bytes of b into high bytes of DI, same as above
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

allsame:
	// compared prefixes are equal: order by length
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET

	// this works for >= 64 bytes of data.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX		// convert EQ to NE
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP diff16

	// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP loop
  1717  
  1718  
// func supportAVX2() bool  (assembly stub for package strings)
// Exposes the runtime's AVX2 feature flag to the strings package.
TEXT strings·supportAVX2(SB),NOSPLIT,$0-1
	MOVBLZX runtime·support_avx2(SB), AX
	MOVB AX, ret+0(FP)
	RET
  1723  
// func supportAVX2() bool  (assembly stub for package bytes)
// Exposes the runtime's AVX2 feature flag to the bytes package.
TEXT bytes·supportAVX2(SB),NOSPLIT,$0-1
	MOVBLZX runtime·support_avx2(SB), AX
	MOVB AX, ret+0(FP)
	RET
  1728  
// func indexShortStr(s, c string) int  (assembly stub for package strings)
// Sets up the register contract documented at runtime·indexShortStr
// (DI/DX = haystack ptr/len, BP/AX = needle ptr/len, R10 = original
// haystack ptr for offset computation, R11 = &result) and jumps there.
TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), DI
	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
	MOVQ s_len+8(FP), DX
	MOVQ c+16(FP), BP
	MOVQ c_len+24(FP), AX
	MOVQ DI, R10
	LEAQ ret+32(FP), R11
	JMP  runtime·indexShortStr(SB)
  1738  
// func indexShortStr(s, c []byte) int  (assembly stub for package bytes)
// Same register setup as strings·indexShortStr, but with []byte
// argument layout (slice headers are 24 bytes each).
TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
	MOVQ s+0(FP), DI
	MOVQ s_len+8(FP), DX
	MOVQ c+24(FP), BP
	MOVQ c_len+32(FP), AX
	MOVQ DI, R10
	LEAQ ret+48(FP), R11
	JMP  runtime·indexShortStr(SB)
  1747  
// AX: length of string, that we are searching for
// DX: length of string, in which we are searching
// DI: pointer to string, in which we are searching
// BP: pointer to string, that we are searching for
// R10: copy of DI, kept so 'success' can compute the byte offset
// R11: address, where to put return value
// Stores the index of the first occurrence (or -1) at (R11).
// Strategy: dispatch on needle length; each case loads the needle
// head (and, where it doesn't fit one register, its tail) into
// registers once and slides a one-byte-step window over the haystack,
// using 2/4/8-byte scalar, 16-byte SSE or 32-byte AVX2 compares.
TEXT runtime·indexShortStr(SB),NOSPLIT,$0
	CMPQ AX, DX
	JA fail			// needle longer than haystack
	CMPQ DX, $16
	JAE sse42		// big enough haystack: consider PCMPESTRI
no_sse42:
	CMPQ AX, $2
	JA   _3_or_more
	MOVW (BP), BP		// needle (1-2 bytes) in BP
	LEAQ -1(DI)(DX*1), DX	// DX = last valid window start + 1
loop2:
	MOVW (DI), SI
	CMPW SI,BP
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop2
	JMP fail
_3_or_more:
	CMPQ AX, $3
	JA   _4_or_more
	MOVW 1(BP), BX		// needle bytes 1-2
	MOVW (BP), BP		// needle bytes 0-1 (overlapping halves)
	LEAQ -2(DI)(DX*1), DX
loop3:
	MOVW (DI), SI
	CMPW SI,BP
	JZ   partial_success3
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
partial_success3:
	MOVW 1(DI), SI
	CMPW SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
_4_or_more:
	CMPQ AX, $4
	JA   _5_or_more
	MOVL (BP), BP		// whole 4-byte needle in BP
	LEAQ -3(DI)(DX*1), DX
loop4:
	MOVL (DI), SI
	CMPL SI,BP
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop4
	JMP fail
_5_or_more:
	CMPQ AX, $7
	JA   _8_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX		// DX = one past last valid window start
	MOVL -4(BP)(AX*1), BX	// last 4 bytes of needle
	MOVL (BP), BP		// first 4 bytes of needle (overlapping)
loop5to7:
	MOVL (DI), SI
	CMPL SI,BP
	JZ   partial_success5to7
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
partial_success5to7:
	MOVL -4(AX)(DI*1), SI	// last 4 bytes of the current window
	CMPL SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
_8_or_more:
	CMPQ AX, $8
	JA   _9_or_more
	MOVQ (BP), BP		// whole 8-byte needle in BP
	LEAQ -7(DI)(DX*1), DX
loop8:
	MOVQ (DI), SI
	CMPQ SI,BP
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop8
	JMP fail
_9_or_more:
	CMPQ AX, $15
	JA   _16_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVQ -8(BP)(AX*1), BX	// last 8 bytes of needle
	MOVQ (BP), BP		// first 8 bytes of needle (overlapping)
loop9to15:
	MOVQ (DI), SI
	CMPQ SI,BP
	JZ   partial_success9to15
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
partial_success9to15:
	MOVQ -8(AX)(DI*1), SI	// last 8 bytes of the current window
	CMPQ SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
_16_or_more:
	CMPQ AX, $16
	JA   _17_or_more
	MOVOU (BP), X1		// whole 16-byte needle in X1
	LEAQ -15(DI)(DX*1), DX
loop16:
	MOVOU (DI), X2
	PCMPEQB X1, X2
	PMOVMSKB X2, SI
	CMPQ  SI, $0xffff	// all 16 bytes equal?
	JE   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop16
	JMP fail
_17_or_more:
	CMPQ AX, $31
	JA   _32_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVOU -16(BP)(AX*1), X0	// last 16 bytes of needle
	MOVOU (BP), X1		// first 16 bytes of needle (overlapping)
loop17to31:
	MOVOU (DI), X2
	PCMPEQB X1,X2
	PMOVMSKB X2, SI
	CMPQ  SI, $0xffff
	JE   partial_success17to31
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
partial_success17to31:
	MOVOU -16(AX)(DI*1), X3	// last 16 bytes of the current window
	PCMPEQB X0, X3
	PMOVMSKB X3, SI
	CMPQ  SI, $0xffff
	JE success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
// So no need to check cpuid
_32_or_more:
	CMPQ AX, $32
	JA   _33_to_63
	VMOVDQU (BP), Y1	// whole 32-byte needle in Y1
	LEAQ -31(DI)(DX*1), DX
loop32:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL  SI, $0xffffffff	// all 32 bytes equal?
	JE   success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop32
	JMP fail_avx2
_33_to_63:
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	VMOVDQU -32(BP)(AX*1), Y0	// last 32 bytes of needle
	VMOVDQU (BP), Y1		// first 32 bytes of needle (overlapping)
loop33to63:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL  SI, $0xffffffff
	JE   partial_success33to63
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop33to63
	JMP fail_avx2
partial_success33to63:
	VMOVDQU -32(AX)(DI*1), Y3	// last 32 bytes of the current window
	VPCMPEQB Y0, Y3, Y4
	VPMOVMSKB Y4, SI
	CMPL  SI, $0xffffffff
	JE success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop33to63
fail_avx2:
	VZEROUPPER		// leave AVX state before returning
fail:
	MOVQ $-1, (R11)
	RET
success_avx2:
	VZEROUPPER
	JMP success
sse42:
	MOVL runtime·cpuid_ecx(SB), CX
	ANDL $0x100000, CX	// test the SSE4.2 feature bit
	JZ no_sse42
	CMPQ AX, $12
	// PCMPESTRI is slower than normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
	JAE _9_or_more
	LEAQ 16(BP), SI
	TESTW $0xff0, SI	// would loading 16 bytes of needle cross a page?
	JEQ no_sse42
	MOVOU (BP), X1
	LEAQ -15(DI)(DX*1), SI
	MOVQ $16, R9
	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
	// for equality (bits 2,3 are 11)
	// result is not masked or inverted (bits 4,5 are 00)
	// and corresponds to first matching byte (bit 6 is 0)
	PCMPESTRI $0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means partial match at the end of the string,
	// otherwise sep is at offset CX from X1 start
	CMPQ CX, R9
	JBE sse42_success
	ADDQ R9, DI
	CMPQ DI, SI
	JB loop_sse42
	PCMPESTRI $0x0c, -1(SI), X1	// final, possibly overlapping window
	CMPQ CX, R9
	JA fail
	LEAQ -1(SI), DI
sse42_success:
	ADDQ CX, DI
success:
	SUBQ R10, DI		// convert match pointer to byte offset
	MOVQ DI, (R11)
	RET
  1997  
  1998  
// func IndexByte(s []byte, c byte) int  (assembly implementation for package bytes)
// Sets up indexbytebody's register contract (SI = data, BX = length,
// AL = byte sought, R8 = &result) and tail-jumps there.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+24(FP), AL
	LEAQ ret+32(FP), R8
	JMP  runtime·indexbytebody(SB)
  2005  
// func IndexByte(s string, c byte) int  (assembly implementation for package strings)
// Same register setup as bytes·IndexByte, with string argument layout.
TEXT strings·IndexByte(SB),NOSPLIT,$0-32
	MOVQ s+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+16(FP), AL
	LEAQ ret+24(FP), R8
	JMP  runtime·indexbytebody(SB)
  2012  
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// Stores the index of the first occurrence of AL (or -1) at (R8).
// Uses 32-byte AVX2 chunks when supported and len > 32, otherwise
// 16-byte SSE chunks; lengths < 16 use page-boundary-safe loads.
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0

	CMPQ BX, $16
	JLT small

	MOVQ SI, DI		// DI = current chunk pointer; SI stays at data start

	CMPQ BX, $32
	JA avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Find first set bit, if any.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ $-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ SI, DI	// Compute offset of chunk within data.
	ADDQ DX, DI	// Add offset of byte within chunk.
	MOVQ DI, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure

	// Check if we'll load across a page boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1 // Load data
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	CMPL	DX, BX
	JAE	failure	// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX	// Shift desired bits down to bottom of register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	CMPB   runtime·support_avx2(SB), $1
	JNE sse			// AVX2 not available: fall back to SSE path
	MOVD AX, X0
	LEAQ -32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB  X0, Y1	// broadcast sought byte to all 32 lanes
avx2_loop:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPTEST Y3, Y3
	JNZ avx2success
	ADDQ $32, DI
	CMPQ DI, R11
	JLT avx2_loop
	// Search the last 32-byte chunk (may overlap prior chunks).
	MOVQ R11, DI
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPTEST Y3, Y3
	JNZ avx2success
	VZEROUPPER		// leave AVX state before returning
	MOVQ $-1, (R8)
	RET

avx2success:
	VPMOVMSKB Y3, DX
	BSFL DX, DX		// index of match within the chunk
	SUBQ SI, DI		// offset of chunk within data
	ADDQ DI, DX
	MOVQ DX, (R8)
	VZEROUPPER
	RET
  2139  
// bool bytes.Equal(a, b []byte)
// Compares the lengths first; on mismatch returns false without touching
// the data. On equal lengths it tail-jumps to runtime·memeqbody, which
// (judging by the setup here) takes the two base pointers in SI/DI, the
// byte count in BX, and the address of the result byte in AX, and writes
// the boolean result itself before returning to our caller.
TEXT bytes·Equal(SB),NOSPLIT,$0-49
	MOVQ	a_len+8(FP), BX		// BX = len(a)
	MOVQ	b_len+32(FP), CX	// CX = len(b)
	CMPQ	BX, CX
	JNE	eqret			// different lengths can never be equal
	MOVQ	a+0(FP), SI		// SI = &a[0]
	MOVQ	b+24(FP), DI		// DI = &b[0]
	LEAQ	ret+48(FP), AX		// AX = address of the bool result slot
	JMP	runtime·memeqbody(SB)	// tail call; memeqbody stores the result
eqret:
	MOVB	$0, ret+48(FP)		// lengths differ: result = false
	RET
  2152  
// uint32 runtime.fastrand()
// Cheap per-M pseudo-random number generator: the state lives in
// m.fastrand, reached through the g pointer in TLS. Each call doubles
// the state and conditionally XORs in the feedback constant 0x88888eef —
// an LFSR-style step. Not cryptographically secure; presumably used for
// scheduler/map randomization — confirm against callers.
TEXT runtime·fastrand(SB), NOSPLIT, $0-4
	get_tls(CX)
	MOVQ	g(CX), AX		// AX = g (current goroutine)
	MOVQ	g_m(AX), AX		// AX = g.m
	MOVL	m_fastrand(AX), DX	// DX = current seed
	ADDL	DX, DX			// DX = seed << 1 (doubled)
	MOVL	DX, BX			// BX = saved copy of doubled seed
	XORL	$0x88888eef, DX		// candidate = doubled ^ constant; sets SF
	CMOVLMI	BX, DX			// if XOR result negative, keep plain doubled value
	MOVL	DX, m_fastrand(AX)	// store new state back into m
	MOVL	DX, ret+0(FP)		// return the new value
	RET
  2165  
// runtime.return0: set the C-style integer return register (AX) to zero.
// MOVL (not XORL) is deliberate here as it leaves the flags untouched.
// NOTE(review): callers appear to use this to fake a zero return value
// through a register rather than the Go stack — confirm against call sites.
TEXT runtime·return0(SB), NOSPLIT, $0
	MOVL	$0, AX
	RET
  2169  
  2170  
// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
// Must obey the gcc calling convention: result in AX, and only
// caller-saved registers (AX, CX here) are clobbered.
TEXT _cgo_topofstack(SB),NOSPLIT,$0
	get_tls(CX)
	MOVQ	g(CX), AX		// AX = g (current goroutine, from TLS)
	MOVQ	g_m(AX), AX		// AX = g.m
	MOVQ	m_curg(AX), AX		// AX = m.curg (the goroutine being run)
	MOVQ	(g_stack+stack_hi)(AX), AX	// AX = curg.stack.hi (top of its stack)
	RET
  2180  
// The top-most function running on a goroutine
// returns to goexit+PCQuantum.
// The leading NOP exists so that the return address pushed for new
// goroutines (goexit+PCQuantum) still lies inside this function's code
// range; the trailing NOP keeps the CALL's return PC inside goexit as
// well, so stack tracebacks attribute it correctly. Do not remove either.
TEXT runtime·goexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP
	CALL	runtime·goexit1(SB)	// does not return
	// traceback from goexit1 must hit code range of goexit
	BYTE	$0x90	// NOP
  2188  
  2189  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
  2190  	MOVQ	addr+0(FP), AX
  2191  	PREFETCHT0	(AX)
  2192  	RET
  2193  
  2194  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
  2195  	MOVQ	addr+0(FP), AX
  2196  	PREFETCHT1	(AX)
  2197  	RET
  2198  
  2199  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
  2200  	MOVQ	addr+0(FP), AX
  2201  	PREFETCHT2	(AX)
  2202  	RET
  2203  
  2204  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
  2205  	MOVQ	addr+0(FP), AX
  2206  	PREFETCHNTA	(AX)
  2207  	RET
  2208  
// This is called from .init_array and follows the platform, not Go, ABI.
// Appends the moduledata passed in DI (first System V argument register)
// to the runtime's linked list of modules by updating the tail pointer.
TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
	MOVQ	runtime·lastmoduledatap(SB), AX	// AX = current list tail
	MOVQ	DI, moduledata_next(AX)		// tail.next = new moduledata (DI)
	MOVQ	DI, runtime·lastmoduledatap(SB)	// new moduledata becomes the tail
	POPQ	R15
	RET