github.com/tidwall/go@v0.0.0-20170415222209-6694a6888b7d/src/runtime/asm_amd64.s

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
    11  	// copy arguments forward on an even stack
    12  	MOVQ	DI, AX		// argc
    13  	MOVQ	SI, BX		// argv
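         	// Reserve room for two argument words and two locals (4*8 bytes)
         	// and round SP down to a 16-byte boundary; argc and argv are then
         	// stored at 16(SP) and 24(SP).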
    14  	SUBQ	$(4*8+7), SP		// 2args 2auto
    15  	ANDQ	$~15, SP
    16  	MOVQ	AX, 16(SP)
    17  	MOVQ	BX, 24(SP)
    18  	
    19  	// create istack out of the given (operating system) stack.
    20  	// _cgo_init may update stackguard.
    21  	MOVQ	$runtime·g0(SB), DI
    22  	LEAQ	(-64*1024+104)(SP), BX
    23  	MOVQ	BX, g_stackguard0(DI)
    24  	MOVQ	BX, g_stackguard1(DI)
    25  	MOVQ	BX, (g_stack+stack_lo)(DI)
    26  	MOVQ	SP, (g_stack+stack_hi)(DI)
    27  
    28  	// find out information about the processor we're on
    29  	MOVQ	$0, AX
    30  	CPUID
    31  	MOVQ	AX, SI
    32  	CMPQ	AX, $0
    33  	JE	nocpuinfo
    34  
    35  	// Figure out how to serialize RDTSC.
    36  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    37  	// Don't know about the rest, so let's do MFENCE.
    38  	CMPL	BX, $0x756E6547  // "Genu"
    39  	JNE	notintel
    40  	CMPL	DX, $0x49656E69  // "ineI"
    41  	JNE	notintel
    42  	CMPL	CX, $0x6C65746E  // "ntel"
    43  	JNE	notintel
    44  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
    45  notintel:
    46  
    47  	// Load EAX=1 cpuid flags
    48  	MOVQ	$1, AX
    49  	CPUID
    50  	MOVL	CX, runtime·cpuid_ecx(SB)
    51  	MOVL	DX, runtime·cpuid_edx(SB)
    52  
    53  	// Load EAX=7/ECX=0 cpuid flags
    54  	CMPQ	SI, $7
    55  	JLT	no7
    56  	MOVL	$7, AX
    57  	MOVL	$0, CX
    58  	CPUID
    59  	MOVL	BX, runtime·cpuid_ebx7(SB)
    60  no7:
    61  	// Detect AVX and AVX2 as per section 14.7.1 "Detection of AVX2" of [1]
    62  	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
    63  	// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
    64  	MOVL	runtime·cpuid_ecx(SB), CX
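         	// 0x18000000 selects CPUID.1:ECX bit 27 (OSXSAVE) and bit 28 (AVX);
         	// both must be set before XGETBV below can be used to check OS support.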
    65  	ANDL    $0x18000000, CX // check for OSXSAVE and AVX bits
    66  	CMPL    CX, $0x18000000
    67  	JNE     noavx
    68  	MOVL    $0, CX
    69  	// For XGETBV, OSXSAVE bit is required and sufficient
    70  	XGETBV
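         	// XGETBV with ECX=0 returns XCR0 in EDX:EAX. Bits 1 (XMM state) and
         	// 2 (YMM state), i.e. the $6 mask, must both be enabled by the OS.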
    71  	ANDL    $6, AX
    72  	CMPL    AX, $6 // Check for OS support of YMM registers
    73  	JNE     noavx
    74  	MOVB    $1, runtime·support_avx(SB)
    75  	TESTL   $(1<<5), runtime·cpuid_ebx7(SB) // check for AVX2 bit
    76  	JEQ     noavx2
    77  	MOVB    $1, runtime·support_avx2(SB)
    78  	JMP     testbmi1
    79  noavx:
    80  	MOVB    $0, runtime·support_avx(SB)
    81  noavx2:
    82  	MOVB    $0, runtime·support_avx2(SB)
    83  testbmi1:
    84  	// Detect BMI1 and BMI2 extensions as per
    85  	// 5.1.16.1 Detection of VEX-encoded GPR Instructions,
    86  	//   LZCNT and TZCNT, PREFETCHW chapter of [1]
    87  	MOVB    $0, runtime·support_bmi1(SB)
    88  	TESTL   $(1<<3), runtime·cpuid_ebx7(SB) // check for BMI1 bit
    89  	JEQ     testbmi2
    90  	MOVB    $1, runtime·support_bmi1(SB)
    91  testbmi2:
    92  	MOVB    $0, runtime·support_bmi2(SB)
    93  	TESTL   $(1<<8), runtime·cpuid_ebx7(SB) // check for BMI2 bit
    94  	JEQ     testpopcnt
    95  	MOVB    $1, runtime·support_bmi2(SB)
    96  testpopcnt:
    97  	MOVB	$0, runtime·support_popcnt(SB)
    98  	TESTL	$(1<<23), runtime·cpuid_ecx(SB) // check for POPCNT bit
    99  	JEQ     nocpuinfo
   100  	MOVB    $1, runtime·support_popcnt(SB)
   101  nocpuinfo:	
   102  	
   103  	// if there is an _cgo_init, call it.
   104  	MOVQ	_cgo_init(SB), AX
   105  	TESTQ	AX, AX
   106  	JZ	needtls
   107  	// g0 already in DI
   108  	MOVQ	DI, CX	// Win64 uses CX for first parameter
   109  	MOVQ	$setg_gcc<>(SB), SI
   110  	CALL	AX
   111  
   112  	// update stackguard after _cgo_init
   113  	MOVQ	$runtime·g0(SB), CX
   114  	MOVQ	(g_stack+stack_lo)(CX), AX
   115  	ADDQ	$const__StackGuard, AX
   116  	MOVQ	AX, g_stackguard0(CX)
   117  	MOVQ	AX, g_stackguard1(CX)
   118  
   119  #ifndef GOOS_windows
   120  	JMP ok
   121  #endif
   122  needtls:
   123  #ifdef GOOS_plan9
   124  	// skip TLS setup on Plan 9
   125  	JMP ok
   126  #endif
   127  #ifdef GOOS_solaris
   128  	// skip TLS setup on Solaris
   129  	JMP ok
   130  #endif
   131  
   132  	LEAQ	runtime·m0+m_tls(SB), DI
   133  	CALL	runtime·settls(SB)
   134  
   135  	// store through it, to make sure it works
   136  	get_tls(BX)
   137  	MOVQ	$0x123, g(BX)
   138  	MOVQ	runtime·m0+m_tls(SB), AX
   139  	CMPQ	AX, $0x123
   140  	JEQ 2(PC)
   141  	MOVL	AX, 0	// abort
   142  ok:
   143  	// set the per-goroutine and per-mach "registers"
   144  	get_tls(BX)
   145  	LEAQ	runtime·g0(SB), CX
   146  	MOVQ	CX, g(BX)
   147  	LEAQ	runtime·m0(SB), AX
   148  
   149  	// save m->g0 = g0
   150  	MOVQ	CX, m_g0(AX)
   151  	// save m0 to g0->m
   152  	MOVQ	AX, g_m(CX)
   153  
   154  	CLD				// convention is D is always left cleared
   155  	CALL	runtime·check(SB)
   156  
   157  	MOVL	16(SP), AX		// copy argc
   158  	MOVL	AX, 0(SP)
   159  	MOVQ	24(SP), AX		// copy argv
   160  	MOVQ	AX, 8(SP)
   161  	CALL	runtime·args(SB)
   162  	CALL	runtime·osinit(SB)
   163  	CALL	runtime·schedinit(SB)
   164  
   165  	// create a new goroutine to start program
   166  	MOVQ	$runtime·mainPC(SB), AX		// entry
   167  	PUSHQ	AX
   168  	PUSHQ	$0			// arg size
   169  	CALL	runtime·newproc(SB)
   170  	POPQ	AX
   171  	POPQ	AX
   172  
   173  	// start this M
   174  	CALL	runtime·mstart(SB)
   175  
   176  	MOVL	$0xf1, 0xf1  // crash
   177  	RET
   178  
   179  DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
   180  GLOBL	runtime·mainPC(SB),RODATA,$8
   181  
   182  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
   183  	BYTE	$0xcc
   184  	RET
   185  
   186  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   187  	// No per-thread init.
   188  	RET
   189  
   190  /*
   191   *  go-routine
   192   */
   193  
   194  // void gosave(Gobuf*)
   195  // save state in Gobuf; setjmp
   196  TEXT runtime·gosave(SB), NOSPLIT, $0-8
   197  	MOVQ	buf+0(FP), AX		// gobuf
   198  	LEAQ	buf+0(FP), BX		// caller's SP
   199  	MOVQ	BX, gobuf_sp(AX)
   200  	MOVQ	0(SP), BX		// caller's PC
   201  	MOVQ	BX, gobuf_pc(AX)
   202  	MOVQ	$0, gobuf_ret(AX)
   203  	MOVQ	BP, gobuf_bp(AX)
   204  	// Assert ctxt is zero. See func save.
   205  	MOVQ	gobuf_ctxt(AX), BX
   206  	TESTQ	BX, BX
   207  	JZ	2(PC)
   208  	CALL	runtime·badctxt(SB)
   209  	get_tls(CX)
   210  	MOVQ	g(CX), BX
   211  	MOVQ	BX, gobuf_g(AX)
   212  	RET
   213  
   214  // void gogo(Gobuf*)
   215  // restore state from Gobuf; longjmp
   216  TEXT runtime·gogo(SB), NOSPLIT, $16-8
   217  	MOVQ	buf+0(FP), BX		// gobuf
   218  
   219  	// If ctxt is not nil, invoke deletion barrier before overwriting.
   220  	MOVQ	gobuf_ctxt(BX), AX
   221  	TESTQ	AX, AX
   222  	JZ	nilctxt
   223  	LEAQ	gobuf_ctxt(BX), AX
   224  	MOVQ	AX, 0(SP)
   225  	MOVQ	$0, 8(SP)
   226  	CALL	runtime·writebarrierptr_prewrite(SB)
   227  	MOVQ	buf+0(FP), BX
   228  
   229  nilctxt:
   230  	MOVQ	gobuf_g(BX), DX
   231  	MOVQ	0(DX), CX		// make sure g != nil
   232  	get_tls(CX)
   233  	MOVQ	DX, g(CX)
   234  	MOVQ	gobuf_sp(BX), SP	// restore SP
   235  	MOVQ	gobuf_ret(BX), AX
   236  	MOVQ	gobuf_ctxt(BX), DX
   237  	MOVQ	gobuf_bp(BX), BP
   238  	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   239  	MOVQ	$0, gobuf_ret(BX)
   240  	MOVQ	$0, gobuf_ctxt(BX)
   241  	MOVQ	$0, gobuf_bp(BX)
   242  	MOVQ	gobuf_pc(BX), BX
   243  	JMP	BX
   244  
   245  // func mcall(fn func(*g))
   246  // Switch to m->g0's stack, call fn(g).
   247  // Fn must never return. It should gogo(&g->sched)
   248  // to keep running g.
   249  TEXT runtime·mcall(SB), NOSPLIT, $0-8
   250  	MOVQ	fn+0(FP), DI
   251  	
   252  	get_tls(CX)
   253  	MOVQ	g(CX), AX	// save state in g->sched
   254  	MOVQ	0(SP), BX	// caller's PC
   255  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   256  	LEAQ	fn+0(FP), BX	// caller's SP
   257  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   258  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   259  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   260  
   261  	// switch to m->g0 & its stack, call fn
   262  	MOVQ	g(CX), BX
   263  	MOVQ	g_m(BX), BX
   264  	MOVQ	m_g0(BX), SI
   265  	CMPQ	SI, AX	// if g == m->g0 call badmcall
   266  	JNE	3(PC)
   267  	MOVQ	$runtime·badmcall(SB), AX
   268  	JMP	AX
   269  	MOVQ	SI, g(CX)	// g = m->g0
   270  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   271  	PUSHQ	AX
   272  	MOVQ	DI, DX
   273  	MOVQ	0(DI), DI
   274  	CALL	DI
   275  	POPQ	AX
   276  	MOVQ	$runtime·badmcall2(SB), AX
   277  	JMP	AX
   278  	RET
   279  
   280  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   281  // of the G stack. We need to distinguish the routine that
   282  // lives at the bottom of the G stack from the one that lives
   283  // at the top of the system stack because the one at the top of
   284  // the system stack terminates the stack walk (see topofstack()).
   285  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   286  	RET
   287  
   288  // func systemstack(fn func())
   289  TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   290  	MOVQ	fn+0(FP), DI	// DI = fn
   291  	get_tls(CX)
   292  	MOVQ	g(CX), AX	// AX = g
   293  	MOVQ	g_m(AX), BX	// BX = m
   294  
   295  	MOVQ	m_gsignal(BX), DX	// DX = gsignal
   296  	CMPQ	AX, DX
   297  	JEQ	noswitch
   298  
   299  	MOVQ	m_g0(BX), DX	// DX = g0
   300  	CMPQ	AX, DX
   301  	JEQ	noswitch
   302  
   303  	MOVQ	m_curg(BX), R8
   304  	CMPQ	AX, R8
   305  	JEQ	switch
   306  	
   307  	// Bad: g is not gsignal, not g0, not curg. What is it?
   308  	MOVQ	$runtime·badsystemstack(SB), AX
   309  	CALL	AX
   310  
   311  switch:
   312  	// save our state in g->sched. Pretend to
   313  	// be systemstack_switch if the G stack is scanned.
   314  	MOVQ	$runtime·systemstack_switch(SB), SI
   315  	MOVQ	SI, (g_sched+gobuf_pc)(AX)
   316  	MOVQ	SP, (g_sched+gobuf_sp)(AX)
   317  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   318  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   319  
   320  	// switch to g0
   321  	MOVQ	DX, g(CX)
   322  	MOVQ	(g_sched+gobuf_sp)(DX), BX
   323  	// make it look like mstart called systemstack on g0, to stop traceback
   324  	SUBQ	$8, BX
   325  	MOVQ	$runtime·mstart(SB), DX
   326  	MOVQ	DX, 0(BX)
   327  	MOVQ	BX, SP
   328  
   329  	// call target function
   330  	MOVQ	DI, DX
   331  	MOVQ	0(DI), DI
   332  	CALL	DI
   333  
   334  	// switch back to g
   335  	get_tls(CX)
   336  	MOVQ	g(CX), AX
   337  	MOVQ	g_m(AX), BX
   338  	MOVQ	m_curg(BX), AX
   339  	MOVQ	AX, g(CX)
   340  	MOVQ	(g_sched+gobuf_sp)(AX), SP
   341  	MOVQ	$0, (g_sched+gobuf_sp)(AX)
   342  	RET
   343  
   344  noswitch:
   345  	// already on m stack, just call directly
   346  	MOVQ	DI, DX
   347  	MOVQ	0(DI), DI
   348  	CALL	DI
   349  	RET
   350  
   351  /*
   352   * support for morestack
   353   */
   354  
   355  // Called during function prolog when more stack is needed.
   356  //
   357  // The traceback routines see morestack on a g0 as being
   358  // the top of a stack (for example, morestack calling newstack
   359  // calling the scheduler calling newm calling gc), so we must
   360  // record an argument size. For that purpose, it has no arguments.
   361  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   362  	// Cannot grow scheduler stack (m->g0).
   363  	get_tls(CX)
   364  	MOVQ	g(CX), BX
   365  	MOVQ	g_m(BX), BX
   366  	MOVQ	m_g0(BX), SI
   367  	CMPQ	g(CX), SI
   368  	JNE	3(PC)
   369  	CALL	runtime·badmorestackg0(SB)
   370  	INT	$3
   371  
   372  	// Cannot grow signal stack (m->gsignal).
   373  	MOVQ	m_gsignal(BX), SI
   374  	CMPQ	g(CX), SI
   375  	JNE	3(PC)
   376  	CALL	runtime·badmorestackgsignal(SB)
   377  	INT	$3
   378  
   379  	// Called from f.
   380  	// Set m->morebuf to f's caller.
   381  	MOVQ	8(SP), AX	// f's caller's PC
   382  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   383  	LEAQ	16(SP), AX	// f's caller's SP
   384  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   385  	get_tls(CX)
   386  	MOVQ	g(CX), SI
   387  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   388  
   389  	// Set g->sched to context in f.
   390  	MOVQ	0(SP), AX // f's PC
   391  	MOVQ	AX, (g_sched+gobuf_pc)(SI)
   392  	MOVQ	SI, (g_sched+gobuf_g)(SI)
   393  	LEAQ	8(SP), AX // f's SP
   394  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   395  	MOVQ	BP, (g_sched+gobuf_bp)(SI)
   396  	// newstack will fill gobuf.ctxt.
   397  
   398  	// Call newstack on m->g0's stack.
   399  	MOVQ	m_g0(BX), BX
   400  	MOVQ	BX, g(CX)
   401  	MOVQ	(g_sched+gobuf_sp)(BX), SP
   402  	PUSHQ	DX	// ctxt argument
   403  	CALL	runtime·newstack(SB)
   404  	MOVQ	$0, 0x1003	// crash if newstack returns
   405  	POPQ	DX	// keep balance check happy
   406  	RET
   407  
   408  // morestack but not preserving ctxt.
   409  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   410  	MOVL	$0, DX
   411  	JMP	runtime·morestack(SB)
   412  
   413  // reflectcall: call a function with the given argument list
   414  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   415  // we don't have variable-sized frames, so we use a small number
   416  // of constant-sized-frame functions to encode a few bits of size in the pc.
   417  // Caution: ugly multiline assembly macros in your future!
   418  
   419  #define DISPATCH(NAME,MAXSIZE)		\
   420  	CMPQ	CX, $MAXSIZE;		\
   421  	JA	3(PC);			\
   422  	MOVQ	$NAME(SB), AX;		\
   423  	JMP	AX
   424  // Note: can't just "JMP NAME(SB)" - bad inlining results.
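         // For example, DISPATCH(runtime·call64, 64) expands to
         //	CMPQ	CX, $64; JA 3(PC); MOVQ $runtime·call64(SB), AX; JMP AX
         // so control transfers to the smallest call* frame whose MAXSIZE can hold
         // the argument size in CX; larger sizes fall through to badreflectcall.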
   425  
   426  TEXT reflect·call(SB), NOSPLIT, $0-0
   427  	JMP	·reflectcall(SB)
   428  
   429  TEXT ·reflectcall(SB), NOSPLIT, $0-32
   430  	MOVLQZX argsize+24(FP), CX
   431  	DISPATCH(runtime·call32, 32)
   432  	DISPATCH(runtime·call64, 64)
   433  	DISPATCH(runtime·call128, 128)
   434  	DISPATCH(runtime·call256, 256)
   435  	DISPATCH(runtime·call512, 512)
   436  	DISPATCH(runtime·call1024, 1024)
   437  	DISPATCH(runtime·call2048, 2048)
   438  	DISPATCH(runtime·call4096, 4096)
   439  	DISPATCH(runtime·call8192, 8192)
   440  	DISPATCH(runtime·call16384, 16384)
   441  	DISPATCH(runtime·call32768, 32768)
   442  	DISPATCH(runtime·call65536, 65536)
   443  	DISPATCH(runtime·call131072, 131072)
   444  	DISPATCH(runtime·call262144, 262144)
   445  	DISPATCH(runtime·call524288, 524288)
   446  	DISPATCH(runtime·call1048576, 1048576)
   447  	DISPATCH(runtime·call2097152, 2097152)
   448  	DISPATCH(runtime·call4194304, 4194304)
   449  	DISPATCH(runtime·call8388608, 8388608)
   450  	DISPATCH(runtime·call16777216, 16777216)
   451  	DISPATCH(runtime·call33554432, 33554432)
   452  	DISPATCH(runtime·call67108864, 67108864)
   453  	DISPATCH(runtime·call134217728, 134217728)
   454  	DISPATCH(runtime·call268435456, 268435456)
   455  	DISPATCH(runtime·call536870912, 536870912)
   456  	DISPATCH(runtime·call1073741824, 1073741824)
   457  	MOVQ	$runtime·badreflectcall(SB), AX
   458  	JMP	AX
   459  
   460  #define CALLFN(NAME,MAXSIZE)			\
   461  TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   462  	NO_LOCAL_POINTERS;			\
   463  	/* copy arguments to stack */		\
   464  	MOVQ	argptr+16(FP), SI;		\
   465  	MOVLQZX argsize+24(FP), CX;		\
   466  	MOVQ	SP, DI;				\
   467  	REP;MOVSB;				\
   468  	/* call function */			\
   469  	MOVQ	f+8(FP), DX;			\
   470  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   471  	CALL	(DX);				\
   472  	/* copy return values back */		\
   473  	MOVQ	argtype+0(FP), DX;		\
   474  	MOVQ	argptr+16(FP), DI;		\
   475  	MOVLQZX	argsize+24(FP), CX;		\
   476  	MOVLQZX	retoffset+28(FP), BX;		\
   477  	MOVQ	SP, SI;				\
   478  	ADDQ	BX, DI;				\
   479  	ADDQ	BX, SI;				\
   480  	SUBQ	BX, CX;				\
   481  	CALL	callRet<>(SB);			\
   482  	RET
   483  
   484  // callRet copies return values back at the end of call*. This is a
   485  // separate function so it can allocate stack space for the arguments
   486  // to reflectcallmove. It does not follow the Go ABI; it expects its
   487  // arguments in registers.
   488  TEXT callRet<>(SB), NOSPLIT, $32-0
   489  	NO_LOCAL_POINTERS
   490  	MOVQ	DX, 0(SP)
   491  	MOVQ	DI, 8(SP)
   492  	MOVQ	SI, 16(SP)
   493  	MOVQ	CX, 24(SP)
   494  	CALL	runtime·reflectcallmove(SB)
   495  	RET
   496  
   497  CALLFN(·call32, 32)
   498  CALLFN(·call64, 64)
   499  CALLFN(·call128, 128)
   500  CALLFN(·call256, 256)
   501  CALLFN(·call512, 512)
   502  CALLFN(·call1024, 1024)
   503  CALLFN(·call2048, 2048)
   504  CALLFN(·call4096, 4096)
   505  CALLFN(·call8192, 8192)
   506  CALLFN(·call16384, 16384)
   507  CALLFN(·call32768, 32768)
   508  CALLFN(·call65536, 65536)
   509  CALLFN(·call131072, 131072)
   510  CALLFN(·call262144, 262144)
   511  CALLFN(·call524288, 524288)
   512  CALLFN(·call1048576, 1048576)
   513  CALLFN(·call2097152, 2097152)
   514  CALLFN(·call4194304, 4194304)
   515  CALLFN(·call8388608, 8388608)
   516  CALLFN(·call16777216, 16777216)
   517  CALLFN(·call33554432, 33554432)
   518  CALLFN(·call67108864, 67108864)
   519  CALLFN(·call134217728, 134217728)
   520  CALLFN(·call268435456, 268435456)
   521  CALLFN(·call536870912, 536870912)
   522  CALLFN(·call1073741824, 1073741824)
   523  
   524  TEXT runtime·procyield(SB),NOSPLIT,$0-0
   525  	MOVL	cycles+0(FP), AX
   526  again:
   527  	PAUSE
   528  	SUBL	$1, AX
   529  	JNZ	again
   530  	RET
   531  
   532  
   533  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   534  	// Stores are already ordered on x86, so this is just a
   535  	// compile barrier.
   536  	RET
   537  
   538  // void jmpdefer(fn, sp);
   539  // called from deferreturn.
   540  // 1. pop the caller
    541  // 2. sub 5 bytes from the caller's return address
   542  // 3. jmp to the argument
   543  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   544  	MOVQ	fv+0(FP), DX	// fn
   545  	MOVQ	argp+8(FP), BX	// caller sp
   546  	LEAQ	-8(BX), SP	// caller sp after CALL
   547  	MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
   548  	SUBQ	$5, (SP)	// return to CALL again
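         	// A direct CALL on amd64 is 5 bytes (opcode plus 32-bit offset), so
         	// backing the return PC up by 5 makes the caller re-execute its CALL
         	// to deferreturn once the deferred function below returns.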
   549  	MOVQ	0(DX), BX
   550  	JMP	BX	// but first run the deferred function
   551  
   552  // Save state of caller into g->sched. Smashes R8, R9.
   553  TEXT gosave<>(SB),NOSPLIT,$0
   554  	get_tls(R8)
   555  	MOVQ	g(R8), R8
   556  	MOVQ	0(SP), R9
   557  	MOVQ	R9, (g_sched+gobuf_pc)(R8)
   558  	LEAQ	8(SP), R9
   559  	MOVQ	R9, (g_sched+gobuf_sp)(R8)
   560  	MOVQ	$0, (g_sched+gobuf_ret)(R8)
   561  	MOVQ	BP, (g_sched+gobuf_bp)(R8)
   562  	// Assert ctxt is zero. See func save.
   563  	MOVQ	(g_sched+gobuf_ctxt)(R8), R9
   564  	TESTQ	R9, R9
   565  	JZ	2(PC)
   566  	CALL	runtime·badctxt(SB)
   567  	RET
   568  
   569  // func asmcgocall(fn, arg unsafe.Pointer) int32
   570  // Call fn(arg) on the scheduler stack,
   571  // aligned appropriately for the gcc ABI.
   572  // See cgocall.go for more details.
   573  TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   574  	MOVQ	fn+0(FP), AX
   575  	MOVQ	arg+8(FP), BX
   576  
   577  	MOVQ	SP, DX
   578  
   579  	// Figure out if we need to switch to m->g0 stack.
   580  	// We get called to create new OS threads too, and those
   581  	// come in on the m->g0 stack already.
   582  	get_tls(CX)
   583  	MOVQ	g(CX), R8
   584  	CMPQ	R8, $0
   585  	JEQ	nosave
   586  	MOVQ	g_m(R8), R8
   587  	MOVQ	m_g0(R8), SI
   588  	MOVQ	g(CX), DI
   589  	CMPQ	SI, DI
   590  	JEQ	nosave
   591  	MOVQ	m_gsignal(R8), SI
   592  	CMPQ	SI, DI
   593  	JEQ	nosave
   594  	
   595  	// Switch to system stack.
   596  	MOVQ	m_g0(R8), SI
   597  	CALL	gosave<>(SB)
   598  	MOVQ	SI, g(CX)
   599  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   600  
   601  	// Now on a scheduling stack (a pthread-created stack).
   602  	// Make sure we have enough room for 4 stack-backed fast-call
   603  	// registers as per windows amd64 calling convention.
   604  	SUBQ	$64, SP
   605  	ANDQ	$~15, SP	// alignment for gcc ABI
   606  	MOVQ	DI, 48(SP)	// save g
   607  	MOVQ	(g_stack+stack_hi)(DI), DI
   608  	SUBQ	DX, DI
   609  	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   610  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   611  	MOVQ	BX, CX		// CX = first argument in Win64
   612  	CALL	AX
   613  
   614  	// Restore registers, g, stack pointer.
   615  	get_tls(CX)
   616  	MOVQ	48(SP), DI
   617  	MOVQ	(g_stack+stack_hi)(DI), SI
   618  	SUBQ	40(SP), SI
   619  	MOVQ	DI, g(CX)
   620  	MOVQ	SI, SP
   621  
   622  	MOVL	AX, ret+16(FP)
   623  	RET
   624  
   625  nosave:
   626  	// Running on a system stack, perhaps even without a g.
   627  	// Having no g can happen during thread creation or thread teardown
   628  	// (see needm/dropm on Solaris, for example).
   629  	// This code is like the above sequence but without saving/restoring g
   630  	// and without worrying about the stack moving out from under us
   631  	// (because we're on a system stack, not a goroutine stack).
   632  	// The above code could be used directly if already on a system stack,
   633  	// but then the only path through this code would be a rare case on Solaris.
   634  	// Using this code for all "already on system stack" calls exercises it more,
   635  	// which should help keep it correct.
   636  	SUBQ	$64, SP
   637  	ANDQ	$~15, SP
   638  	MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
   639  	MOVQ	DX, 40(SP)	// save original stack pointer
   640  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   641  	MOVQ	BX, CX		// CX = first argument in Win64
   642  	CALL	AX
   643  	MOVQ	40(SP), SI	// restore original stack pointer
   644  	MOVQ	SI, SP
   645  	MOVL	AX, ret+16(FP)
   646  	RET
   647  
   648  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
   649  // Turn the fn into a Go func (by taking its address) and call
   650  // cgocallback_gofunc.
   651  TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
   652  	LEAQ	fn+0(FP), AX
   653  	MOVQ	AX, 0(SP)
   654  	MOVQ	frame+8(FP), AX
   655  	MOVQ	AX, 8(SP)
   656  	MOVQ	framesize+16(FP), AX
   657  	MOVQ	AX, 16(SP)
   658  	MOVQ	ctxt+24(FP), AX
   659  	MOVQ	AX, 24(SP)
   660  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   661  	CALL	AX
   662  	RET
   663  
   664  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
   665  // See cgocall.go for more details.
   666  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
   667  	NO_LOCAL_POINTERS
   668  
   669  	// If g is nil, Go did not create the current thread.
   670  	// Call needm to obtain one m for temporary use.
   671  	// In this case, we're running on the thread stack, so there's
   672  	// lots of space, but the linker doesn't know. Hide the call from
   673  	// the linker analysis by using an indirect call through AX.
   674  	get_tls(CX)
   675  #ifdef GOOS_windows
   676  	MOVL	$0, BX
   677  	CMPQ	CX, $0
   678  	JEQ	2(PC)
   679  #endif
   680  	MOVQ	g(CX), BX
   681  	CMPQ	BX, $0
   682  	JEQ	needm
   683  	MOVQ	g_m(BX), BX
   684  	MOVQ	BX, R8 // holds oldm until end of function
   685  	JMP	havem
   686  needm:
   687  	MOVQ	$0, 0(SP)
   688  	MOVQ	$runtime·needm(SB), AX
   689  	CALL	AX
   690  	MOVQ	0(SP), R8
   691  	get_tls(CX)
   692  	MOVQ	g(CX), BX
   693  	MOVQ	g_m(BX), BX
   694  	
   695  	// Set m->sched.sp = SP, so that if a panic happens
   696  	// during the function we are about to execute, it will
   697  	// have a valid SP to run on the g0 stack.
   698  	// The next few lines (after the havem label)
   699  	// will save this SP onto the stack and then write
   700  	// the same SP back to m->sched.sp. That seems redundant,
   701  	// but if an unrecovered panic happens, unwindm will
   702  	// restore the g->sched.sp from the stack location
   703  	// and then systemstack will try to use it. If we don't set it here,
   704  	// that restored SP will be uninitialized (typically 0) and
   705  	// will not be usable.
   706  	MOVQ	m_g0(BX), SI
   707  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   708  
   709  havem:
   710  	// Now there's a valid m, and we're running on its m->g0.
   711  	// Save current m->g0->sched.sp on stack and then set it to SP.
   712  	// Save current sp in m->g0->sched.sp in preparation for
   713  	// switch back to m->curg stack.
   714  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   715  	MOVQ	m_g0(BX), SI
   716  	MOVQ	(g_sched+gobuf_sp)(SI), AX
   717  	MOVQ	AX, 0(SP)
   718  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   719  
   720  	// Switch to m->curg stack and call runtime.cgocallbackg.
   721  	// Because we are taking over the execution of m->curg
   722  	// but *not* resuming what had been running, we need to
   723  	// save that information (m->curg->sched) so we can restore it.
   724  	// We can restore m->curg->sched.sp easily, because calling
   725  	// runtime.cgocallbackg leaves SP unchanged upon return.
   726  	// To save m->curg->sched.pc, we push it onto the stack.
   727  	// This has the added benefit that it looks to the traceback
   728  	// routine like cgocallbackg is going to return to that
   729  	// PC (because the frame we allocate below has the same
   730  	// size as cgocallback_gofunc's frame declared above)
   731  	// so that the traceback will seamlessly trace back into
   732  	// the earlier calls.
   733  	//
   734  	// In the new goroutine, 8(SP) holds the saved R8.
   735  	MOVQ	m_curg(BX), SI
   736  	MOVQ	SI, g(CX)
   737  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   738  	MOVQ	(g_sched+gobuf_pc)(SI), BX
   739  	MOVQ	BX, -8(DI)
   740  	// Compute the size of the frame, including return PC and, if
   741  	// GOEXPERIMENT=framepointer, the saved base pointer
   742  	MOVQ	ctxt+24(FP), BX
   743  	LEAQ	fv+0(FP), AX
   744  	SUBQ	SP, AX
   745  	SUBQ	AX, DI
   746  	MOVQ	DI, SP
   747  
   748  	MOVQ	R8, 8(SP)
   749  	MOVQ	BX, 0(SP)
   750  	CALL	runtime·cgocallbackg(SB)
   751  	MOVQ	8(SP), R8
   752  
   753  	// Compute the size of the frame again. FP and SP have
   754  	// completely different values here than they did above,
   755  	// but only their difference matters.
   756  	LEAQ	fv+0(FP), AX
   757  	SUBQ	SP, AX
   758  
   759  	// Restore g->sched (== m->curg->sched) from saved values.
   760  	get_tls(CX)
   761  	MOVQ	g(CX), SI
   762  	MOVQ	SP, DI
   763  	ADDQ	AX, DI
   764  	MOVQ	-8(DI), BX
   765  	MOVQ	BX, (g_sched+gobuf_pc)(SI)
   766  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   767  
   768  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   769  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   770  	// so we do not have to restore it.)
   771  	MOVQ	g(CX), BX
   772  	MOVQ	g_m(BX), BX
   773  	MOVQ	m_g0(BX), SI
   774  	MOVQ	SI, g(CX)
   775  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   776  	MOVQ	0(SP), AX
   777  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   778  	
   779  	// If the m on entry was nil, we called needm above to borrow an m
   780  	// for the duration of the call. Since the call is over, return it with dropm.
   781  	CMPQ	R8, $0
   782  	JNE 3(PC)
   783  	MOVQ	$runtime·dropm(SB), AX
   784  	CALL	AX
   785  
   786  	// Done!
   787  	RET
   788  
   789  // void setg(G*); set g. for use by needm.
   790  TEXT runtime·setg(SB), NOSPLIT, $0-8
   791  	MOVQ	gg+0(FP), BX
   792  #ifdef GOOS_windows
   793  	CMPQ	BX, $0
   794  	JNE	settls
   795  	MOVQ	$0, 0x28(GS)
   796  	RET
   797  settls:
   798  	MOVQ	g_m(BX), AX
   799  	LEAQ	m_tls(AX), AX
   800  	MOVQ	AX, 0x28(GS)
   801  #endif
   802  	get_tls(CX)
   803  	MOVQ	BX, g(CX)
   804  	RET
   805  
   806  // void setg_gcc(G*); set g called from gcc.
   807  TEXT setg_gcc<>(SB),NOSPLIT,$0
   808  	get_tls(AX)
   809  	MOVQ	DI, g(AX)
   810  	RET
   811  
   812  // check that SP is in range [g->stack.lo, g->stack.hi)
   813  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   814  	get_tls(CX)
   815  	MOVQ	g(CX), AX
   816  	CMPQ	(g_stack+stack_hi)(AX), SP
   817  	JHI	2(PC)
   818  	INT	$3
   819  	CMPQ	SP, (g_stack+stack_lo)(AX)
   820  	JHI	2(PC)
   821  	INT	$3
   822  	RET
   823  
   824  TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
   825  	MOVQ	argp+0(FP),AX		// addr of first arg
   826  	MOVQ	-8(AX),AX		// get calling pc
   827  	MOVQ	AX, ret+8(FP)
   828  	RET
   829  
   830  // func cputicks() int64
   831  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   832  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   833  	JNE	mfence
   834  	LFENCE
   835  	JMP	done
   836  mfence:
   837  	MFENCE
   838  done:
   839  	RDTSC
   840  	SHLQ	$32, DX
   841  	ADDQ	DX, AX
   842  	MOVQ	AX, ret+0(FP)
   843  	RET
   844  
   845  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   846  // redirects to memhash(p, h, size) using the size
   847  // stored in the closure.
   848  TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
   849  	GO_ARGS
   850  	NO_LOCAL_POINTERS
   851  	MOVQ	p+0(FP), AX
   852  	MOVQ	h+8(FP), BX
   853  	MOVQ	8(DX), CX
   854  	MOVQ	AX, 0(SP)
   855  	MOVQ	BX, 8(SP)
   856  	MOVQ	CX, 16(SP)
   857  	CALL	runtime·memhash(SB)
   858  	MOVQ	24(SP), AX
   859  	MOVQ	AX, ret+16(FP)
   860  	RET
   861  
   862  // hash function using AES hardware instructions
   863  TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   864  	MOVQ	p+0(FP), AX	// ptr to data
   865  	MOVQ	s+16(FP), CX	// size
   866  	LEAQ	ret+24(FP), DX
   867  	JMP	runtime·aeshashbody(SB)
   868  
   869  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
   870  	MOVQ	p+0(FP), AX	// ptr to string struct
   871  	MOVQ	8(AX), CX	// length of string
   872  	MOVQ	(AX), AX	// string data
   873  	LEAQ	ret+16(FP), DX
   874  	JMP	runtime·aeshashbody(SB)
   875  
   876  // AX: data
   877  // CX: length
   878  // DX: address to put return value
   879  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   880  	// Fill an SSE register with our seeds.
   881  	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
   882  	PINSRW	$4, CX, X0			// 16 bits of length
   883  	PSHUFHW $0, X0, X0			// repeat length 4 times total
   884  	MOVO	X0, X1				// save unscrambled seed
   885  	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
   886  	AESENC	X0, X0				// scramble seed
   887  
   888  	CMPQ	CX, $16
   889  	JB	aes0to15
   890  	JE	aes16
   891  	CMPQ	CX, $32
   892  	JBE	aes17to32
   893  	CMPQ	CX, $64
   894  	JBE	aes33to64
   895  	CMPQ	CX, $128
   896  	JBE	aes65to128
   897  	JMP	aes129plus
   898  
   899  aes0to15:
   900  	TESTQ	CX, CX
   901  	JE	aes0
   902  
   903  	ADDQ	$16, AX
   904  	TESTW	$0xff0, AX
   905  	JE	endofpage
   906  
   907  	// 16 bytes loaded at this address won't cross
   908  	// a page boundary, so we can load it directly.
   909  	MOVOU	-16(AX), X1
   910  	ADDQ	CX, CX
   911  	MOVQ	$masks<>(SB), AX
   912  	PAND	(AX)(CX*8), X1
   913  final1:
   914  	PXOR	X0, X1	// xor data with seed
   915  	AESENC	X1, X1	// scramble combo 3 times
   916  	AESENC	X1, X1
   917  	AESENC	X1, X1
   918  	MOVQ	X1, (DX)
   919  	RET
   920  
   921  endofpage:
   922  	// address ends in 1111xxxx. Might be up against
   923  	// a page boundary, so load ending at last byte.
   924  	// Then shift bytes down using pshufb.
   925  	MOVOU	-32(AX)(CX*1), X1
   926  	ADDQ	CX, CX
   927  	MOVQ	$shifts<>(SB), AX
   928  	PSHUFB	(AX)(CX*8), X1
   929  	JMP	final1
   930  
   931  aes0:
   932  	// Return scrambled input seed
   933  	AESENC	X0, X0
   934  	MOVQ	X0, (DX)
   935  	RET
   936  
   937  aes16:
   938  	MOVOU	(AX), X1
   939  	JMP	final1
   940  
   941  aes17to32:
   942  	// make second starting seed
   943  	PXOR	runtime·aeskeysched+16(SB), X1
   944  	AESENC	X1, X1
   945  	
   946  	// load data to be hashed
   947  	MOVOU	(AX), X2
   948  	MOVOU	-16(AX)(CX*1), X3
   949  
   950  	// xor with seed
   951  	PXOR	X0, X2
   952  	PXOR	X1, X3
   953  
   954  	// scramble 3 times
   955  	AESENC	X2, X2
   956  	AESENC	X3, X3
   957  	AESENC	X2, X2
   958  	AESENC	X3, X3
   959  	AESENC	X2, X2
   960  	AESENC	X3, X3
   961  
   962  	// combine results
   963  	PXOR	X3, X2
   964  	MOVQ	X2, (DX)
   965  	RET
   966  
   967  aes33to64:
   968  	// make 3 more starting seeds
   969  	MOVO	X1, X2
   970  	MOVO	X1, X3
   971  	PXOR	runtime·aeskeysched+16(SB), X1
   972  	PXOR	runtime·aeskeysched+32(SB), X2
   973  	PXOR	runtime·aeskeysched+48(SB), X3
   974  	AESENC	X1, X1
   975  	AESENC	X2, X2
   976  	AESENC	X3, X3
   977  	
   978  	MOVOU	(AX), X4
   979  	MOVOU	16(AX), X5
   980  	MOVOU	-32(AX)(CX*1), X6
   981  	MOVOU	-16(AX)(CX*1), X7
   982  
   983  	PXOR	X0, X4
   984  	PXOR	X1, X5
   985  	PXOR	X2, X6
   986  	PXOR	X3, X7
   987  	
   988  	AESENC	X4, X4
   989  	AESENC	X5, X5
   990  	AESENC	X6, X6
   991  	AESENC	X7, X7
   992  	
   993  	AESENC	X4, X4
   994  	AESENC	X5, X5
   995  	AESENC	X6, X6
   996  	AESENC	X7, X7
   997  	
   998  	AESENC	X4, X4
   999  	AESENC	X5, X5
  1000  	AESENC	X6, X6
  1001  	AESENC	X7, X7
  1002  
  1003  	PXOR	X6, X4
  1004  	PXOR	X7, X5
  1005  	PXOR	X5, X4
  1006  	MOVQ	X4, (DX)
  1007  	RET
  1008  
  1009  aes65to128:
  1010  	// make 7 more starting seeds
  1011  	MOVO	X1, X2
  1012  	MOVO	X1, X3
  1013  	MOVO	X1, X4
  1014  	MOVO	X1, X5
  1015  	MOVO	X1, X6
  1016  	MOVO	X1, X7
  1017  	PXOR	runtime·aeskeysched+16(SB), X1
  1018  	PXOR	runtime·aeskeysched+32(SB), X2
  1019  	PXOR	runtime·aeskeysched+48(SB), X3
  1020  	PXOR	runtime·aeskeysched+64(SB), X4
  1021  	PXOR	runtime·aeskeysched+80(SB), X5
  1022  	PXOR	runtime·aeskeysched+96(SB), X6
  1023  	PXOR	runtime·aeskeysched+112(SB), X7
  1024  	AESENC	X1, X1
  1025  	AESENC	X2, X2
  1026  	AESENC	X3, X3
  1027  	AESENC	X4, X4
  1028  	AESENC	X5, X5
  1029  	AESENC	X6, X6
  1030  	AESENC	X7, X7
  1031  
  1032  	// load data
  1033  	MOVOU	(AX), X8
  1034  	MOVOU	16(AX), X9
  1035  	MOVOU	32(AX), X10
  1036  	MOVOU	48(AX), X11
  1037  	MOVOU	-64(AX)(CX*1), X12
  1038  	MOVOU	-48(AX)(CX*1), X13
  1039  	MOVOU	-32(AX)(CX*1), X14
  1040  	MOVOU	-16(AX)(CX*1), X15
  1041  
  1042  	// xor with seed
  1043  	PXOR	X0, X8
  1044  	PXOR	X1, X9
  1045  	PXOR	X2, X10
  1046  	PXOR	X3, X11
  1047  	PXOR	X4, X12
  1048  	PXOR	X5, X13
  1049  	PXOR	X6, X14
  1050  	PXOR	X7, X15
  1051  
  1052  	// scramble 3 times
  1053  	AESENC	X8, X8
  1054  	AESENC	X9, X9
  1055  	AESENC	X10, X10
  1056  	AESENC	X11, X11
  1057  	AESENC	X12, X12
  1058  	AESENC	X13, X13
  1059  	AESENC	X14, X14
  1060  	AESENC	X15, X15
  1061  
  1062  	AESENC	X8, X8
  1063  	AESENC	X9, X9
  1064  	AESENC	X10, X10
  1065  	AESENC	X11, X11
  1066  	AESENC	X12, X12
  1067  	AESENC	X13, X13
  1068  	AESENC	X14, X14
  1069  	AESENC	X15, X15
  1070  
  1071  	AESENC	X8, X8
  1072  	AESENC	X9, X9
  1073  	AESENC	X10, X10
  1074  	AESENC	X11, X11
  1075  	AESENC	X12, X12
  1076  	AESENC	X13, X13
  1077  	AESENC	X14, X14
  1078  	AESENC	X15, X15
  1079  
  1080  	// combine results
  1081  	PXOR	X12, X8
  1082  	PXOR	X13, X9
  1083  	PXOR	X14, X10
  1084  	PXOR	X15, X11
  1085  	PXOR	X10, X8
  1086  	PXOR	X11, X9
  1087  	PXOR	X9, X8
  1088  	MOVQ	X8, (DX)
  1089  	RET
  1090  
  1091  aes129plus:
  1092  	// make 7 more starting seeds
  1093  	MOVO	X1, X2
  1094  	MOVO	X1, X3
  1095  	MOVO	X1, X4
  1096  	MOVO	X1, X5
  1097  	MOVO	X1, X6
  1098  	MOVO	X1, X7
  1099  	PXOR	runtime·aeskeysched+16(SB), X1
  1100  	PXOR	runtime·aeskeysched+32(SB), X2
  1101  	PXOR	runtime·aeskeysched+48(SB), X3
  1102  	PXOR	runtime·aeskeysched+64(SB), X4
  1103  	PXOR	runtime·aeskeysched+80(SB), X5
  1104  	PXOR	runtime·aeskeysched+96(SB), X6
  1105  	PXOR	runtime·aeskeysched+112(SB), X7
  1106  	AESENC	X1, X1
  1107  	AESENC	X2, X2
  1108  	AESENC	X3, X3
  1109  	AESENC	X4, X4
  1110  	AESENC	X5, X5
  1111  	AESENC	X6, X6
  1112  	AESENC	X7, X7
  1113  	
  1114  	// start with last (possibly overlapping) block
  1115  	MOVOU	-128(AX)(CX*1), X8
  1116  	MOVOU	-112(AX)(CX*1), X9
  1117  	MOVOU	-96(AX)(CX*1), X10
  1118  	MOVOU	-80(AX)(CX*1), X11
  1119  	MOVOU	-64(AX)(CX*1), X12
  1120  	MOVOU	-48(AX)(CX*1), X13
  1121  	MOVOU	-32(AX)(CX*1), X14
  1122  	MOVOU	-16(AX)(CX*1), X15
  1123  
  1124  	// xor in seed
  1125  	PXOR	X0, X8
  1126  	PXOR	X1, X9
  1127  	PXOR	X2, X10
  1128  	PXOR	X3, X11
  1129  	PXOR	X4, X12
  1130  	PXOR	X5, X13
  1131  	PXOR	X6, X14
  1132  	PXOR	X7, X15
  1133  	
  1134  	// compute number of remaining 128-byte blocks
  1135  	DECQ	CX
  1136  	SHRQ	$7, CX
  1137  	
  1138  aesloop:
  1139  	// scramble state
  1140  	AESENC	X8, X8
  1141  	AESENC	X9, X9
  1142  	AESENC	X10, X10
  1143  	AESENC	X11, X11
  1144  	AESENC	X12, X12
  1145  	AESENC	X13, X13
  1146  	AESENC	X14, X14
  1147  	AESENC	X15, X15
  1148  
  1149  	// scramble state, xor in a block
  1150  	MOVOU	(AX), X0
  1151  	MOVOU	16(AX), X1
  1152  	MOVOU	32(AX), X2
  1153  	MOVOU	48(AX), X3
  1154  	AESENC	X0, X8
  1155  	AESENC	X1, X9
  1156  	AESENC	X2, X10
  1157  	AESENC	X3, X11
  1158  	MOVOU	64(AX), X4
  1159  	MOVOU	80(AX), X5
  1160  	MOVOU	96(AX), X6
  1161  	MOVOU	112(AX), X7
  1162  	AESENC	X4, X12
  1163  	AESENC	X5, X13
  1164  	AESENC	X6, X14
  1165  	AESENC	X7, X15
  1166  
  1167  	ADDQ	$128, AX
  1168  	DECQ	CX
  1169  	JNE	aesloop
  1170  
  1171  	// 3 more scrambles to finish
  1172  	AESENC	X8, X8
  1173  	AESENC	X9, X9
  1174  	AESENC	X10, X10
  1175  	AESENC	X11, X11
  1176  	AESENC	X12, X12
  1177  	AESENC	X13, X13
  1178  	AESENC	X14, X14
  1179  	AESENC	X15, X15
  1180  	AESENC	X8, X8
  1181  	AESENC	X9, X9
  1182  	AESENC	X10, X10
  1183  	AESENC	X11, X11
  1184  	AESENC	X12, X12
  1185  	AESENC	X13, X13
  1186  	AESENC	X14, X14
  1187  	AESENC	X15, X15
  1188  	AESENC	X8, X8
  1189  	AESENC	X9, X9
  1190  	AESENC	X10, X10
  1191  	AESENC	X11, X11
  1192  	AESENC	X12, X12
  1193  	AESENC	X13, X13
  1194  	AESENC	X14, X14
  1195  	AESENC	X15, X15
  1196  
  1197  	PXOR	X12, X8
  1198  	PXOR	X13, X9
  1199  	PXOR	X14, X10
  1200  	PXOR	X15, X11
  1201  	PXOR	X10, X8
  1202  	PXOR	X11, X9
  1203  	PXOR	X9, X8
  1204  	MOVQ	X8, (DX)
  1205  	RET
  1206  	
  1207  TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
  1208  	MOVQ	p+0(FP), AX	// ptr to data
  1209  	MOVQ	h+8(FP), X0	// seed
  1210  	PINSRD	$2, (AX), X0	// data
  1211  	AESENC	runtime·aeskeysched+0(SB), X0
  1212  	AESENC	runtime·aeskeysched+16(SB), X0
  1213  	AESENC	runtime·aeskeysched+32(SB), X0
  1214  	MOVQ	X0, ret+16(FP)
  1215  	RET
  1216  
  1217  TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
  1218  	MOVQ	p+0(FP), AX	// ptr to data
  1219  	MOVQ	h+8(FP), X0	// seed
  1220  	PINSRQ	$1, (AX), X0	// data
  1221  	AESENC	runtime·aeskeysched+0(SB), X0
  1222  	AESENC	runtime·aeskeysched+16(SB), X0
  1223  	AESENC	runtime·aeskeysched+32(SB), X0
  1224  	MOVQ	X0, ret+16(FP)
  1225  	RET
  1226  
  1227  // simple mask to get rid of data in the high part of the register.
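         // Entry i (the 16 bytes at offset 16*i) has its low i bytes set to 0xff and
         // the rest zero. aeshashbody doubles the length in CX and indexes the table
         // with (CX*8), i.e. 16*length, so the PAND keeps only the loaded bytes that
         // belong to the data.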
  1228  DATA masks<>+0x00(SB)/8, $0x0000000000000000
  1229  DATA masks<>+0x08(SB)/8, $0x0000000000000000
  1230  DATA masks<>+0x10(SB)/8, $0x00000000000000ff
  1231  DATA masks<>+0x18(SB)/8, $0x0000000000000000
  1232  DATA masks<>+0x20(SB)/8, $0x000000000000ffff
  1233  DATA masks<>+0x28(SB)/8, $0x0000000000000000
  1234  DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
  1235  DATA masks<>+0x38(SB)/8, $0x0000000000000000
  1236  DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
  1237  DATA masks<>+0x48(SB)/8, $0x0000000000000000
  1238  DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
  1239  DATA masks<>+0x58(SB)/8, $0x0000000000000000
  1240  DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
  1241  DATA masks<>+0x68(SB)/8, $0x0000000000000000
  1242  DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
  1243  DATA masks<>+0x78(SB)/8, $0x0000000000000000
  1244  DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
  1245  DATA masks<>+0x88(SB)/8, $0x0000000000000000
  1246  DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
  1247  DATA masks<>+0x98(SB)/8, $0x00000000000000ff
  1248  DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
  1249  DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
  1250  DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
  1251  DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
  1252  DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
  1253  DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
  1254  DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
  1255  DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
  1256  DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
  1257  DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
  1258  DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
  1259  DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
  1260  GLOBL masks<>(SB),RODATA,$256
  1261  
  1262  TEXT ·checkASM(SB),NOSPLIT,$0-1
   1263  	// check that masks<>(SB) and shifts<>(SB) are 16-byte aligned
  1264  	MOVQ	$masks<>(SB), AX
  1265  	MOVQ	$shifts<>(SB), BX
  1266  	ORQ	BX, AX
  1267  	TESTQ	$15, AX
  1268  	SETEQ	ret+0(FP)
  1269  	RET
  1270  
  1271  // these are arguments to pshufb. They move data down from
  1272  // the high bytes of the register to the low bytes of the register.
  1273  // index is how many bytes to move.
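         // A control byte with its high bit set makes PSHUFB write a zero byte, so
         // entry i moves the top i bytes of the register down to the bottom and
         // clears the remaining bytes.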
  1274  DATA shifts<>+0x00(SB)/8, $0x0000000000000000
  1275  DATA shifts<>+0x08(SB)/8, $0x0000000000000000
  1276  DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
  1277  DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
  1278  DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
  1279  DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
  1280  DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
  1281  DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
  1282  DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
  1283  DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
  1284  DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
  1285  DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
  1286  DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
  1287  DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
  1288  DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
  1289  DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
  1290  DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
  1291  DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
  1292  DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
  1293  DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
  1294  DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
  1295  DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
  1296  DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
  1297  DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
  1298  DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
  1299  DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
  1300  DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
  1301  DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
  1302  DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
  1303  DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
  1304  DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
  1305  DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
  1306  GLOBL shifts<>(SB),RODATA,$256
  1307  
  1308  // memequal(p, q unsafe.Pointer, size uintptr) bool
  1309  TEXT runtime·memequal(SB),NOSPLIT,$0-25
  1310  	MOVQ	a+0(FP), SI
  1311  	MOVQ	b+8(FP), DI
  1312  	CMPQ	SI, DI
  1313  	JEQ	eq
  1314  	MOVQ	size+16(FP), BX
  1315  	LEAQ	ret+24(FP), AX
  1316  	JMP	runtime·memeqbody(SB)
  1317  eq:
  1318  	MOVB	$1, ret+24(FP)
  1319  	RET
  1320  
  1321  // memequal_varlen(a, b unsafe.Pointer) bool
  1322  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
  1323  	MOVQ	a+0(FP), SI
  1324  	MOVQ	b+8(FP), DI
  1325  	CMPQ	SI, DI
  1326  	JEQ	eq
  1327  	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
  1328  	LEAQ	ret+16(FP), AX
  1329  	JMP	runtime·memeqbody(SB)
  1330  eq:
  1331  	MOVB	$1, ret+16(FP)
  1332  	RET
  1333  
  1334  // eqstring tests whether two strings are equal.
  1335  // The compiler guarantees that strings passed
  1336  // to eqstring have equal length.
  1337  // See runtime_test.go:eqstring_generic for
  1338  // equivalent Go code.
  1339  TEXT runtime·eqstring(SB),NOSPLIT,$0-33
  1340  	MOVQ	s1_base+0(FP), SI
  1341  	MOVQ	s2_base+16(FP), DI
  1342  	CMPQ	SI, DI
  1343  	JEQ	eq
  1344  	MOVQ	s1_len+8(FP), BX
  1345  	LEAQ	ret+32(FP), AX
  1346  	JMP	runtime·memeqbody(SB)
  1347  eq:
  1348  	MOVB	$1, ret+32(FP)
  1349  	RET
  1350  
  1351  // a in SI
  1352  // b in DI
  1353  // count in BX
  1354  // address of result byte in AX
  1355  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1356  	CMPQ	BX, $8
  1357  	JB	small
  1358  	CMPQ	BX, $64
  1359  	JB	bigloop
  1360  	CMPB    runtime·support_avx2(SB), $1
  1361  	JE	hugeloop_avx2
  1362  	
  1363  	// 64 bytes at a time using xmm registers
  1364  hugeloop:
  1365  	CMPQ	BX, $64
  1366  	JB	bigloop
  1367  	MOVOU	(SI), X0
  1368  	MOVOU	(DI), X1
  1369  	MOVOU	16(SI), X2
  1370  	MOVOU	16(DI), X3
  1371  	MOVOU	32(SI), X4
  1372  	MOVOU	32(DI), X5
  1373  	MOVOU	48(SI), X6
  1374  	MOVOU	48(DI), X7
  1375  	PCMPEQB	X1, X0
  1376  	PCMPEQB	X3, X2
  1377  	PCMPEQB	X5, X4
  1378  	PCMPEQB	X7, X6
  1379  	PAND	X2, X0
  1380  	PAND	X6, X4
  1381  	PAND	X4, X0
  1382  	PMOVMSKB X0, DX
  1383  	ADDQ	$64, SI
  1384  	ADDQ	$64, DI
  1385  	SUBQ	$64, BX
  1386  	CMPL	DX, $0xffff
  1387  	JEQ	hugeloop
  1388  	MOVB	$0, (AX)
  1389  	RET
  1390  
  1391  	// 64 bytes at a time using ymm registers
  1392  hugeloop_avx2:
  1393  	CMPQ	BX, $64
  1394  	JB	bigloop_avx2
  1395  	VMOVDQU	(SI), Y0
  1396  	VMOVDQU	(DI), Y1
  1397  	VMOVDQU	32(SI), Y2
  1398  	VMOVDQU	32(DI), Y3
  1399  	VPCMPEQB	Y1, Y0, Y4
  1400  	VPCMPEQB	Y2, Y3, Y5
  1401  	VPAND	Y4, Y5, Y6
  1402  	VPMOVMSKB Y6, DX
  1403  	ADDQ	$64, SI
  1404  	ADDQ	$64, DI
  1405  	SUBQ	$64, BX
  1406  	CMPL	DX, $0xffffffff
  1407  	JEQ	hugeloop_avx2
  1408  	VZEROUPPER
  1409  	MOVB	$0, (AX)
  1410  	RET
  1411  
  1412  bigloop_avx2:
  1413  	VZEROUPPER
  1414  
  1415  	// 8 bytes at a time using 64-bit register
  1416  bigloop:
  1417  	CMPQ	BX, $8
  1418  	JBE	leftover
  1419  	MOVQ	(SI), CX
  1420  	MOVQ	(DI), DX
  1421  	ADDQ	$8, SI
  1422  	ADDQ	$8, DI
  1423  	SUBQ	$8, BX
  1424  	CMPQ	CX, DX
  1425  	JEQ	bigloop
  1426  	MOVB	$0, (AX)
  1427  	RET
  1428  
  1429  	// remaining 0-8 bytes
  1430  leftover:
  1431  	MOVQ	-8(SI)(BX*1), CX
  1432  	MOVQ	-8(DI)(BX*1), DX
  1433  	CMPQ	CX, DX
  1434  	SETEQ	(AX)
  1435  	RET
  1436  
  1437  small:
  1438  	CMPQ	BX, $0
  1439  	JEQ	equal
  1440  
  1441  	LEAQ	0(BX*8), CX
  1442  	NEGQ	CX
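         	// CX = -(8 * byte count); shift counts are taken mod 64, so shifting
         	// by CX shifts by 64 - 8*BX bits, the part of an 8-byte load that does
         	// not belong to the data.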
  1443  
  1444  	CMPB	SI, $0xf8
  1445  	JA	si_high
  1446  
  1447  	// load at SI won't cross a page boundary.
  1448  	MOVQ	(SI), SI
  1449  	JMP	si_finish
  1450  si_high:
   1451  	// address ends in 11111xxx. Load the 8 bytes ending at the last byte we want, then shift them down into position.
  1452  	MOVQ	-8(SI)(BX*1), SI
  1453  	SHRQ	CX, SI
  1454  si_finish:
  1455  
  1456  	// same for DI.
  1457  	CMPB	DI, $0xf8
  1458  	JA	di_high
  1459  	MOVQ	(DI), DI
  1460  	JMP	di_finish
  1461  di_high:
  1462  	MOVQ	-8(DI)(BX*1), DI
  1463  	SHRQ	CX, DI
  1464  di_finish:
  1465  
  1466  	SUBQ	SI, DI
  1467  	SHLQ	CX, DI
  1468  equal:
  1469  	SETEQ	(AX)
  1470  	RET
  1471  
  1472  TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
  1473  	MOVQ	s1_base+0(FP), SI
  1474  	MOVQ	s1_len+8(FP), BX
  1475  	MOVQ	s2_base+16(FP), DI
  1476  	MOVQ	s2_len+24(FP), DX
  1477  	LEAQ	ret+32(FP), R9
  1478  	JMP	runtime·cmpbody(SB)
  1479  
  1480  TEXT bytes·Compare(SB),NOSPLIT,$0-56
  1481  	MOVQ	s1+0(FP), SI
  1482  	MOVQ	s1+8(FP), BX
  1483  	MOVQ	s2+24(FP), DI
  1484  	MOVQ	s2+32(FP), DX
  1485  	LEAQ	res+48(FP), R9
  1486  	JMP	runtime·cmpbody(SB)
  1487  
  1488  // input:
  1489  //   SI = a
  1490  //   DI = b
  1491  //   BX = alen
  1492  //   DX = blen
  1493  //   R9 = address of output word (stores -1/0/1 here)
  1494  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1495  	CMPQ	SI, DI
  1496  	JEQ	allsame
  1497  	CMPQ	BX, DX
  1498  	MOVQ	DX, R8
  1499  	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
  1500  	CMPQ	R8, $8
  1501  	JB	small
  1502  
  1503  	CMPQ	R8, $63
  1504  	JBE	loop
  1505  	CMPB    runtime·support_avx2(SB), $1
  1506  	JEQ     big_loop_avx2
  1507  	JMP	big_loop
  1508  loop:
  1509  	CMPQ	R8, $16
  1510  	JBE	_0through16
  1511  	MOVOU	(SI), X0
  1512  	MOVOU	(DI), X1
  1513  	PCMPEQB X0, X1
  1514  	PMOVMSKB X1, AX
  1515  	XORQ	$0xffff, AX	// convert EQ to NE
  1516  	JNE	diff16	// branch if at least one byte is not equal
  1517  	ADDQ	$16, SI
  1518  	ADDQ	$16, DI
  1519  	SUBQ	$16, R8
  1520  	JMP	loop
  1521  	
  1522  diff64:
  1523  	ADDQ	$48, SI
  1524  	ADDQ	$48, DI
  1525  	JMP	diff16
  1526  diff48:
  1527  	ADDQ	$32, SI
  1528  	ADDQ	$32, DI
  1529  	JMP	diff16
  1530  diff32:
  1531  	ADDQ	$16, SI
  1532  	ADDQ	$16, DI
  1533  	// AX = bit mask of differences
  1534  diff16:
  1535  	BSFQ	AX, BX	// index of first byte that differs
  1536  	XORQ	AX, AX
  1537  	MOVB	(SI)(BX*1), CX
  1538  	CMPB	CX, (DI)(BX*1)
  1539  	SETHI	AX
  1540  	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
  1541  	MOVQ	AX, (R9)
  1542  	RET
  1543  
  1544  	// 0 through 16 bytes left, alen>=8, blen>=8
  1545  _0through16:
  1546  	CMPQ	R8, $8
  1547  	JBE	_0through8
  1548  	MOVQ	(SI), AX
  1549  	MOVQ	(DI), CX
  1550  	CMPQ	AX, CX
  1551  	JNE	diff8
  1552  _0through8:
  1553  	MOVQ	-8(SI)(R8*1), AX
  1554  	MOVQ	-8(DI)(R8*1), CX
  1555  	CMPQ	AX, CX
  1556  	JEQ	allsame
  1557  
  1558  	// AX and CX contain parts of a and b that differ.
  1559  diff8:
  1560  	BSWAPQ	AX	// reverse order of bytes
  1561  	BSWAPQ	CX
  1562  	XORQ	AX, CX
  1563  	BSRQ	CX, CX	// index of highest bit difference
  1564  	SHRQ	CX, AX	// move a's bit to bottom
  1565  	ANDQ	$1, AX	// mask bit
  1566  	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
  1567  	MOVQ	AX, (R9)
  1568  	RET
  1569  
  1570  	// 0-7 bytes in common
  1571  small:
  1572  	LEAQ	(R8*8), CX	// bytes left -> bits left
   1573  	NEGQ	CX		// - bits left (shift counts are mod 64, so this is 64 - bits left)
  1574  	JEQ	allsame
  1575  
   1576  	// load bytes of a into high bytes of SI
  1577  	CMPB	SI, $0xf8
  1578  	JA	si_high
  1579  	MOVQ	(SI), SI
  1580  	JMP	si_finish
  1581  si_high:
  1582  	MOVQ	-8(SI)(R8*1), SI
  1583  	SHRQ	CX, SI
  1584  si_finish:
  1585  	SHLQ	CX, SI
  1586  
   1587  	// load bytes of b into high bytes of DI
  1588  	CMPB	DI, $0xf8
  1589  	JA	di_high
  1590  	MOVQ	(DI), DI
  1591  	JMP	di_finish
  1592  di_high:
  1593  	MOVQ	-8(DI)(R8*1), DI
  1594  	SHRQ	CX, DI
  1595  di_finish:
  1596  	SHLQ	CX, DI
  1597  
  1598  	BSWAPQ	SI	// reverse order of bytes
  1599  	BSWAPQ	DI
  1600  	XORQ	SI, DI	// find bit differences
  1601  	JEQ	allsame
  1602  	BSRQ	DI, CX	// index of highest bit difference
  1603  	SHRQ	CX, SI	// move a's bit to bottom
  1604  	ANDQ	$1, SI	// mask bit
  1605  	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
  1606  	MOVQ	AX, (R9)
  1607  	RET
  1608  
  1609  allsame:
  1610  	XORQ	AX, AX
  1611  	XORQ	CX, CX
  1612  	CMPQ	BX, DX
  1613  	SETGT	AX	// 1 if alen > blen
  1614  	SETEQ	CX	// 1 if alen == blen
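         	// CX + 2*AX - 1: alen > blen gives 1, alen == blen gives 0, alen < blen gives -1.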
  1615  	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
  1616  	MOVQ	AX, (R9)
  1617  	RET
  1618  
  1619  	// this works for >= 64 bytes of data.
  1620  big_loop:
  1621  	MOVOU	(SI), X0
  1622  	MOVOU	(DI), X1
  1623  	PCMPEQB X0, X1
  1624  	PMOVMSKB X1, AX
  1625  	XORQ	$0xffff, AX
  1626  	JNE	diff16
  1627  
  1628  	MOVOU	16(SI), X0
  1629  	MOVOU	16(DI), X1
  1630  	PCMPEQB X0, X1
  1631  	PMOVMSKB X1, AX
  1632  	XORQ	$0xffff, AX
  1633  	JNE	diff32
  1634  
  1635  	MOVOU	32(SI), X0
  1636  	MOVOU	32(DI), X1
  1637  	PCMPEQB X0, X1
  1638  	PMOVMSKB X1, AX
  1639  	XORQ	$0xffff, AX
  1640  	JNE	diff48
  1641  
  1642  	MOVOU	48(SI), X0
  1643  	MOVOU	48(DI), X1
  1644  	PCMPEQB X0, X1
  1645  	PMOVMSKB X1, AX
  1646  	XORQ	$0xffff, AX
  1647  	JNE	diff64
  1648  
  1649  	ADDQ	$64, SI
  1650  	ADDQ	$64, DI
  1651  	SUBQ	$64, R8
  1652  	CMPQ	R8, $64
  1653  	JBE	loop
  1654  	JMP	big_loop
  1655  
   1656  	// Compare 64 bytes per loop iteration.
  1657  	// Loop is unrolled and uses AVX2.
  1658  big_loop_avx2:
  1659  	VMOVDQU	(SI), Y2
  1660  	VMOVDQU	(DI), Y3
  1661  	VMOVDQU	32(SI), Y4
  1662  	VMOVDQU	32(DI), Y5
  1663  	VPCMPEQB Y2, Y3, Y0
  1664  	VPMOVMSKB Y0, AX
  1665  	XORL	$0xffffffff, AX
  1666  	JNE	diff32_avx2
  1667  	VPCMPEQB Y4, Y5, Y6
  1668  	VPMOVMSKB Y6, AX
  1669  	XORL	$0xffffffff, AX
  1670  	JNE	diff64_avx2
  1671  
  1672  	ADDQ	$64, SI
  1673  	ADDQ	$64, DI
  1674  	SUBQ	$64, R8
  1675  	CMPQ	R8, $64
  1676  	JB	big_loop_avx2_exit
  1677  	JMP	big_loop_avx2
  1678  
  1679  	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
  1680  diff32_avx2:
  1681  	VZEROUPPER
  1682  	JMP diff16
  1683  
  1684  	// Same as diff32_avx2, but for last 32 bytes.
  1685  diff64_avx2:
  1686  	VZEROUPPER
  1687  	JMP diff48
  1688  
   1689  	// For a remainder of <64 bytes, jump back to the normal loop.
  1690  big_loop_avx2_exit:
  1691  	VZEROUPPER
  1692  	JMP loop
  1693  
  1694  
  1695  TEXT strings·supportAVX2(SB),NOSPLIT,$0-1
  1696  	MOVBLZX runtime·support_avx2(SB), AX
  1697  	MOVB AX, ret+0(FP)
  1698  	RET
  1699  
  1700  TEXT bytes·supportAVX2(SB),NOSPLIT,$0-1
  1701  	MOVBLZX runtime·support_avx2(SB), AX
  1702  	MOVB AX, ret+0(FP)
  1703  	RET
  1704  
  1705  TEXT strings·supportPOPCNT(SB),NOSPLIT,$0-1
  1706  	MOVBLZX runtime·support_popcnt(SB), AX
  1707  	MOVB AX, ret+0(FP)
  1708  	RET
  1709  
  1710  TEXT bytes·supportPOPCNT(SB),NOSPLIT,$0-1
  1711  	MOVBLZX runtime·support_popcnt(SB), AX
  1712  	MOVB AX, ret+0(FP)
  1713  	RET
  1714  
  1715  TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
  1716  	MOVQ s+0(FP), DI
  1717  	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
  1718  	MOVQ s_len+8(FP), DX
  1719  	MOVQ c+16(FP), BP
  1720  	MOVQ c_len+24(FP), AX
  1721  	MOVQ DI, R10
  1722  	LEAQ ret+32(FP), R11
  1723  	JMP  runtime·indexShortStr(SB)
  1724  
  1725  TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
  1726  	MOVQ s+0(FP), DI
  1727  	MOVQ s_len+8(FP), DX
  1728  	MOVQ c+24(FP), BP
  1729  	MOVQ c_len+32(FP), AX
  1730  	MOVQ DI, R10
  1731  	LEAQ ret+48(FP), R11
  1732  	JMP  runtime·indexShortStr(SB)
  1733  
   1734  // AX: length of the string we are searching for
   1735  // DX: length of the string we are searching in
   1736  // DI: pointer to the string we are searching in
   1737  // BP: pointer to the string we are searching for
   1738  // R11: address where the result should be stored
  1739  TEXT runtime·indexShortStr(SB),NOSPLIT,$0
  1740  	CMPQ AX, DX
  1741  	JA fail
  1742  	CMPQ DX, $16
  1743  	JAE sse42
  1744  no_sse42:
  1745  	CMPQ AX, $2
  1746  	JA   _3_or_more
  1747  	MOVW (BP), BP
  1748  	LEAQ -1(DI)(DX*1), DX
  1749  loop2:
  1750  	MOVW (DI), SI
  1751  	CMPW SI,BP
  1752  	JZ success
  1753  	ADDQ $1,DI
  1754  	CMPQ DI,DX
  1755  	JB loop2
  1756  	JMP fail
  1757  _3_or_more:
  1758  	CMPQ AX, $3
  1759  	JA   _4_or_more
  1760  	MOVW 1(BP), BX
  1761  	MOVW (BP), BP
  1762  	LEAQ -2(DI)(DX*1), DX
  1763  loop3:
  1764  	MOVW (DI), SI
  1765  	CMPW SI,BP
  1766  	JZ   partial_success3
  1767  	ADDQ $1,DI
  1768  	CMPQ DI,DX
  1769  	JB loop3
  1770  	JMP fail
  1771  partial_success3:
  1772  	MOVW 1(DI), SI
  1773  	CMPW SI,BX
  1774  	JZ success
  1775  	ADDQ $1,DI
  1776  	CMPQ DI,DX
  1777  	JB loop3
  1778  	JMP fail
  1779  _4_or_more:
  1780  	CMPQ AX, $4
  1781  	JA   _5_or_more
  1782  	MOVL (BP), BP
  1783  	LEAQ -3(DI)(DX*1), DX
  1784  loop4:
  1785  	MOVL (DI), SI
  1786  	CMPL SI,BP
  1787  	JZ   success
  1788  	ADDQ $1,DI
  1789  	CMPQ DI,DX
  1790  	JB loop4
  1791  	JMP fail
  1792  _5_or_more:
  1793  	CMPQ AX, $7
  1794  	JA   _8_or_more
  1795  	LEAQ 1(DI)(DX*1), DX
  1796  	SUBQ AX, DX
  1797  	MOVL -4(BP)(AX*1), BX
  1798  	MOVL (BP), BP
  1799  loop5to7:
  1800  	MOVL (DI), SI
  1801  	CMPL SI,BP
  1802  	JZ   partial_success5to7
  1803  	ADDQ $1,DI
  1804  	CMPQ DI,DX
  1805  	JB loop5to7
  1806  	JMP fail
  1807  partial_success5to7:
  1808  	MOVL -4(AX)(DI*1), SI
  1809  	CMPL SI,BX
  1810  	JZ success
  1811  	ADDQ $1,DI
  1812  	CMPQ DI,DX
  1813  	JB loop5to7
  1814  	JMP fail
  1815  _8_or_more:
  1816  	CMPQ AX, $8
  1817  	JA   _9_or_more
  1818  	MOVQ (BP), BP
  1819  	LEAQ -7(DI)(DX*1), DX
  1820  loop8:
  1821  	MOVQ (DI), SI
  1822  	CMPQ SI,BP
  1823  	JZ   success
  1824  	ADDQ $1,DI
  1825  	CMPQ DI,DX
  1826  	JB loop8
  1827  	JMP fail
  1828  _9_or_more:
  1829  	CMPQ AX, $15
  1830  	JA   _16_or_more
  1831  	LEAQ 1(DI)(DX*1), DX
  1832  	SUBQ AX, DX
  1833  	MOVQ -8(BP)(AX*1), BX
  1834  	MOVQ (BP), BP
  1835  loop9to15:
  1836  	MOVQ (DI), SI
  1837  	CMPQ SI,BP
  1838  	JZ   partial_success9to15
  1839  	ADDQ $1,DI
  1840  	CMPQ DI,DX
  1841  	JB loop9to15
  1842  	JMP fail
  1843  partial_success9to15:
  1844  	MOVQ -8(AX)(DI*1), SI
  1845  	CMPQ SI,BX
  1846  	JZ success
  1847  	ADDQ $1,DI
  1848  	CMPQ DI,DX
  1849  	JB loop9to15
  1850  	JMP fail
  1851  _16_or_more:
  1852  	CMPQ AX, $16
  1853  	JA   _17_or_more
  1854  	MOVOU (BP), X1
  1855  	LEAQ -15(DI)(DX*1), DX
  1856  loop16:
  1857  	MOVOU (DI), X2
  1858  	PCMPEQB X1, X2
  1859  	PMOVMSKB X2, SI
  1860  	CMPQ  SI, $0xffff
  1861  	JE   success
  1862  	ADDQ $1,DI
  1863  	CMPQ DI,DX
  1864  	JB loop16
  1865  	JMP fail
  1866  _17_or_more:
  1867  	CMPQ AX, $31
  1868  	JA   _32_or_more
  1869  	LEAQ 1(DI)(DX*1), DX
  1870  	SUBQ AX, DX
  1871  	MOVOU -16(BP)(AX*1), X0
  1872  	MOVOU (BP), X1
  1873  loop17to31:
  1874  	MOVOU (DI), X2
  1875  	PCMPEQB X1,X2
  1876  	PMOVMSKB X2, SI
  1877  	CMPQ  SI, $0xffff
  1878  	JE   partial_success17to31
  1879  	ADDQ $1,DI
  1880  	CMPQ DI,DX
  1881  	JB loop17to31
  1882  	JMP fail
  1883  partial_success17to31:
  1884  	MOVOU -16(AX)(DI*1), X3
  1885  	PCMPEQB X0, X3
  1886  	PMOVMSKB X3, SI
  1887  	CMPQ  SI, $0xffff
  1888  	JE success
  1889  	ADDQ $1,DI
  1890  	CMPQ DI,DX
  1891  	JB loop17to31
  1892  	JMP fail
  1893  // We can get here only when AVX2 is enabled and the cutoff for indexShortStr
  1894  // is set to 63, so there is no need to check cpuid.
  1895  _32_or_more:
  1896  	CMPQ AX, $32
  1897  	JA   _33_to_63
  1898  	VMOVDQU (BP), Y1
  1899  	LEAQ -31(DI)(DX*1), DX
  1900  loop32:
  1901  	VMOVDQU (DI), Y2
  1902  	VPCMPEQB Y1, Y2, Y3
  1903  	VPMOVMSKB Y3, SI
  1904  	CMPL  SI, $0xffffffff
  1905  	JE   success_avx2
  1906  	ADDQ $1,DI
  1907  	CMPQ DI,DX
  1908  	JB loop32
  1909  	JMP fail_avx2
  1910  _33_to_63:
  1911  	LEAQ 1(DI)(DX*1), DX
  1912  	SUBQ AX, DX
  1913  	VMOVDQU -32(BP)(AX*1), Y0
  1914  	VMOVDQU (BP), Y1
  1915  loop33to63:
  1916  	VMOVDQU (DI), Y2
  1917  	VPCMPEQB Y1, Y2, Y3
  1918  	VPMOVMSKB Y3, SI
  1919  	CMPL  SI, $0xffffffff
  1920  	JE   partial_success33to63
  1921  	ADDQ $1,DI
  1922  	CMPQ DI,DX
  1923  	JB loop33to63
  1924  	JMP fail_avx2
  1925  partial_success33to63:
  1926  	VMOVDQU -32(AX)(DI*1), Y3
  1927  	VPCMPEQB Y0, Y3, Y4
  1928  	VPMOVMSKB Y4, SI
  1929  	CMPL  SI, $0xffffffff
  1930  	JE success_avx2
  1931  	ADDQ $1,DI
  1932  	CMPQ DI,DX
  1933  	JB loop33to63
  1934  fail_avx2:
  1935  	VZEROUPPER
  1936  fail:
  1937  	MOVQ $-1, (R11)
  1938  	RET
  1939  success_avx2:
  1940  	VZEROUPPER
  1941  	JMP success
  1942  sse42:
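        	// CPUID.1:ECX bit 20 ($0x100000) indicates SSE4.2, and therefore
        	// PCMPESTRI, support.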
  1943  	MOVL runtime·cpuid_ecx(SB), CX
  1944  	ANDL $0x100000, CX
  1945  	JZ no_sse42
  1946  	CMPQ AX, $12
  1947  	// PCMPESTRI is slower than a normal compare,
  1948  	// so using it only makes sense if we advance 4+ bytes per compare.
  1949  	// This cutoff was determined experimentally and is roughly the same
  1950  	// on Nehalem (the first CPU with SSE4.2) and Haswell.
  1951  	JAE _9_or_more
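        	// The 16-byte load of sep below could fault if BP lies in the last 16
        	// bytes of a page (pages assumed to be at least 4KB); in that case fall
        	// back to the generic search.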
  1952  	LEAQ 16(BP), SI
  1953  	TESTW $0xff0, SI
  1954  	JEQ no_sse42
  1955  	MOVOU (BP), X1
  1956  	LEAQ -15(DI)(DX*1), SI
  1957  	MOVQ $16, R9
  1958  	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
  1959  loop_sse42:
  1960  	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
  1961  	// for equality (bits 2,3 are 11)
  1962  	// result is not masked or inverted (bits 4,5 are 00)
  1963  	// and corresponds to first matching byte (bit 6 is 0)
  1964  	PCMPESTRI $0x0c, (DI), X1
  1965  	// CX == 16 means no match,
  1966  	// CX > R9 means a partial match at the end of the string,
  1967  	// otherwise sep starts at offset CX within the 16 bytes loaded from DI.
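        	// Because DI advances by R9 = 16-len(sep) per iteration, a match that
        	// begins near the end of one window starts at an offset <= R9 in some
        	// later window, where it fits entirely and is reported, so no candidate
        	// position is skipped.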
  1968  	CMPQ CX, R9
  1969  	JBE sse42_success
  1970  	ADDQ R9, DI
  1971  	CMPQ DI, SI
  1972  	JB loop_sse42
  1973  	PCMPESTRI $0x0c, -1(SI), X1
  1974  	CMPQ CX, R9
  1975  	JA fail
  1976  	LEAQ -1(SI), DI
  1977  sse42_success:
  1978  	ADDQ CX, DI
  1979  success:
  1980  	SUBQ R10, DI
  1981  	MOVQ DI, (R11)
  1982  	RET
  1983  
  1984  
  1985  TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
  1986  	MOVQ s+0(FP), SI
  1987  	MOVQ s_len+8(FP), BX
  1988  	MOVB c+24(FP), AL
  1989  	LEAQ ret+32(FP), R8
  1990  	JMP  runtime·indexbytebody(SB)
  1991  
  1992  TEXT strings·IndexByte(SB),NOSPLIT,$0-32
  1993  	MOVQ s+0(FP), SI
  1994  	MOVQ s_len+8(FP), BX
  1995  	MOVB c+16(FP), AL
  1996  	LEAQ ret+24(FP), R8
  1997  	JMP  runtime·indexbytebody(SB)
  1998  
  1999  // input:
  2000  //   SI: data
  2001  //   BX: data len
  2002  //   AL: byte sought
  2003  //   R8: address to put result
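        //
        // Strategy: broadcast the byte into every lane of X0 (or Y1 with AVX2),
        // compare 16 (or 32) bytes of data at a time, and use PMOVMSKB + BSF to
        // locate the first matching byte. Short inputs and the final chunk use
        // overlapping or masked loads so out-of-range bytes never produce a false
        // match and no load crosses into an unmapped page.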
  2004  TEXT runtime·indexbytebody(SB),NOSPLIT,$0
  2005  	// Shuffle X0 around so that each byte contains
  2006  	// the character we're looking for.
  2007  	MOVD AX, X0
  2008  	PUNPCKLBW X0, X0
  2009  	PUNPCKLBW X0, X0
  2010  	PSHUFL $0, X0, X0
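        	// (The first PUNPCKLBW duplicates AL into 2 bytes, the second into 4,
        	// and PSHUFL $0 replicates that dword across all 16 bytes of X0.)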
  2011  	
  2012  	CMPQ BX, $16
  2013  	JLT small
  2014  
  2015  	MOVQ SI, DI
  2016  
  2017  	CMPQ BX, $32
  2018  	JA avx2
  2019  sse:
  2020  	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
  2021  	JMP	sseloopentry
  2022  	
  2023  sseloop:
  2024  	// Move the next 16-byte chunk of the data into X1.
  2025  	MOVOU	(DI), X1
  2026  	// Compare bytes in X0 to X1.
  2027  	PCMPEQB	X0, X1
  2028  	// Take the top bit of each byte in X1 and put the result in DX.
  2029  	PMOVMSKB X1, DX
  2030  	// Find first set bit, if any.
  2031  	BSFL	DX, DX
  2032  	JNZ	ssesuccess
  2033  	// Advance to next block.
  2034  	ADDQ	$16, DI
  2035  sseloopentry:
  2036  	CMPQ	DI, AX
  2037  	JB	sseloop
  2038  
  2039  	// Search the last 16-byte chunk. This chunk may overlap with the
  2040  	// chunks we've already searched, but that's ok.
  2041  	MOVQ	AX, DI
  2042  	MOVOU	(AX), X1
  2043  	PCMPEQB	X0, X1
  2044  	PMOVMSKB X1, DX
  2045  	BSFL	DX, DX
  2046  	JNZ	ssesuccess
  2047  
  2048  failure:
  2049  	MOVQ $-1, (R8)
  2050  	RET
  2051  
  2052  // We've found a chunk containing the byte.
  2053  // The chunk was loaded from DI.
  2054  // The index of the matching byte in the chunk is DX.
  2055  // The start of the data is SI.
  2056  ssesuccess:
  2057  	SUBQ SI, DI	// Compute offset of chunk within data.
  2058  	ADDQ DX, DI	// Add offset of byte within chunk.
  2059  	MOVQ DI, (R8)
  2060  	RET
  2061  
  2062  // Handle lengths < 16.
  2063  small:
  2064  	TESTQ	BX, BX
  2065  	JEQ	failure
  2066  
  2067  	// Check if we'll load across a page boundary.
  2068  	LEAQ	16(SI), AX
  2069  	TESTW	$0xff0, AX
  2070  	JEQ	endofpage
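        	// If SI+16 has its low 12 bits in 0..15, SI lies in the last 16 bytes
        	// of a (presumably 4KB) page and a 16-byte load from SI might fault, so
        	// take the endofpage path, which instead loads the 16 bytes ending at
        	// the last byte of the data.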
  2071  
  2072  	MOVOU	(SI), X1 // Load data
  2073  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  2074  	PMOVMSKB X1, DX	// Move result bits to integer register.
  2075  	BSFL	DX, DX	// Find first set bit.
  2076  	JZ	failure	// No set bit, failure.
  2077  	CMPL	DX, BX
  2078  	JAE	failure	// Match is past end of data.
  2079  	MOVQ	DX, (R8)
  2080  	RET
  2081  
  2082  endofpage:
  2083  	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
  2084  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  2085  	PMOVMSKB X1, DX	// Move result bits to integer register.
  2086  	MOVL	BX, CX
  2087  	SHLL	CX, DX
  2088  	SHRL	$16, DX	// Shift desired bits down to bottom of register.
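        	// The data was loaded so that the BX valid bytes sit in the high end of
        	// X1. Shifting the 16-bit mask left by BX and then right by 16 discards
        	// match bits from bytes before the data and moves the remaining bits
        	// down to positions 0..BX-1.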
  2089  	BSFL	DX, DX	// Find first set bit.
  2090  	JZ	failure	// No set bit, failure.
  2091  	MOVQ	DX, (R8)
  2092  	RET
  2093  
  2094  avx2:
  2095  	CMPB   runtime·support_avx2(SB), $1
  2096  	JNE sse
  2097  	MOVD AX, X0
  2098  	LEAQ -32(SI)(BX*1), R11
  2099  	VPBROADCASTB  X0, Y1
  2100  avx2_loop:
  2101  	VMOVDQU (DI), Y2
  2102  	VPCMPEQB Y1, Y2, Y3
  2103  	VPTEST Y3, Y3
  2104  	JNZ avx2success
  2105  	ADDQ $32, DI
  2106  	CMPQ DI, R11
  2107  	JLT avx2_loop
  2108  	MOVQ R11, DI
  2109  	VMOVDQU (DI), Y2
  2110  	VPCMPEQB Y1, Y2, Y3
  2111  	VPTEST Y3, Y3
  2112  	JNZ avx2success
  2113  	VZEROUPPER
  2114  	MOVQ $-1, (R8)
  2115  	RET
  2116  
  2117  avx2success:
  2118  	VPMOVMSKB Y3, DX
  2119  	BSFL DX, DX
  2120  	SUBQ SI, DI
  2121  	ADDQ DI, DX
  2122  	MOVQ DX, (R8)
  2123  	VZEROUPPER
  2124  	RET
  2125  
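        // Equal compares the lengths first and returns false without touching the
        // data if they differ; otherwise runtime·memeqbody compares the bytes and
        // stores the boolean result through the address passed in AX.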
  2126  TEXT bytes·Equal(SB),NOSPLIT,$0-49
  2127  	MOVQ	a_len+8(FP), BX
  2128  	MOVQ	b_len+32(FP), CX
  2129  	CMPQ	BX, CX
  2130  	JNE	eqret
  2131  	MOVQ	a+0(FP), SI
  2132  	MOVQ	b+24(FP), DI
  2133  	LEAQ	ret+48(FP), AX
  2134  	JMP	runtime·memeqbody(SB)
  2135  eqret:
  2136  	MOVB	$0, ret+48(FP)
  2137  	RET
  2138  
  2139  
  2140  TEXT bytes·countByte(SB),NOSPLIT,$0-40
  2141  	MOVQ s+0(FP), SI
  2142  	MOVQ s_len+8(FP), BX
  2143  	MOVB c+24(FP), AL
  2144  	LEAQ ret+32(FP), R8
  2145  	JMP  runtime·countByte(SB)
  2146  
  2147  TEXT strings·countByte(SB),NOSPLIT,$0-32
  2148  	MOVQ s+0(FP), SI
  2149  	MOVQ s_len+8(FP), BX
  2150  	MOVB c+16(FP), AL
  2151  	LEAQ ret+24(FP), R8
  2152  	JMP  runtime·countByte(SB)
  2153  
  2154  // input:
  2155  //   SI: data
  2156  //   BX: data len
  2157  //   AL: byte sought
  2158  //   R8: address to put result
  2159  // This requires the POPCNT instruction
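        //
        // Strategy: broadcast the byte, compare 16 (or 32 with AVX2) bytes at a
        // time, turn each comparison into a bitmask with PMOVMSKB, POPCNT that
        // mask, and accumulate the total in R12. The final partial chunk is
        // reloaded with overlap and masked so that bytes already counted, or
        // bytes outside the slice, contribute nothing.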
  2160  TEXT runtime·countByte(SB),NOSPLIT,$0
  2161  	// Shuffle X0 around so that each byte contains
  2162  	// the character we're looking for.
  2163  	MOVD AX, X0
  2164  	PUNPCKLBW X0, X0
  2165  	PUNPCKLBW X0, X0
  2166  	PSHUFL $0, X0, X0
  2167  
  2168  	CMPQ BX, $16
  2169  	JLT small
  2170  
  2171  	MOVQ $0, R12 // Accumulator
  2172  
  2173  	MOVQ SI, DI
  2174  
  2175  	CMPQ BX, $32
  2176  	JA avx2
  2177  sse:
  2178  	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
  2179  	JMP	sseloopentry
  2180  
  2181  sseloop:
  2182  	// Move the next 16-byte chunk of the data into X1.
  2183  	MOVOU	(DI), X1
  2184  	// Compare bytes in X0 to X1.
  2185  	PCMPEQB	X0, X1
  2186  	// Take the top bit of each byte in X1 and put the result in DX.
  2187  	PMOVMSKB X1, DX
  2188  	// Count number of matching bytes
  2189  	POPCNTL DX, DX
  2190  	// Accumulate into R12
  2191  	ADDQ DX, R12
  2192  	// Advance to next block.
  2193  	ADDQ	$16, DI
  2194  sseloopentry:
  2195  	CMPQ	DI, AX
  2196  	JBE	sseloop
  2197  
  2198  	// Get the number of bytes to consider in the last 16 bytes
  2199  	ANDQ $15, BX
  2200  	JZ end
  2201  
  2202  	// Create a mask to ignore the overlap between the previous 16-byte block
  2203  	// and the next.
  2204  	MOVQ $16,CX
  2205  	SUBQ BX, CX
  2206  	MOVQ $0xFFFF, R10
  2207  	SARQ CL, R10
  2208  	SALQ CL, R10
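        	// R10 now has the top BX bits of the 16-bit mask set; e.g. BX=3 gives
        	// 0xE000, selecting only the 3 trailing bytes not yet counted by the
        	// main loop.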
  2209  
  2210  	// Process the last 16-byte chunk. This chunk may overlap with the
  2211  	// chunks we've already searched so we need to mask part of it.
  2212  	MOVOU	(AX), X1
  2213  	PCMPEQB	X0, X1
  2214  	PMOVMSKB X1, DX
  2215  	// Apply mask
  2216  	ANDQ R10, DX
  2217  	POPCNTL DX, DX
  2218  	ADDQ DX, R12
  2219  end:
  2220  	MOVQ R12, (R8)
  2221  	RET
  2222  
  2223  // Handle lengths < 16.
  2224  small:
  2225  	TESTQ	BX, BX
  2226  	JEQ	endzero
  2227  
  2228  	// Check if we'll load across a page boundary.
  2229  	LEAQ	16(SI), AX
  2230  	TESTW	$0xff0, AX
  2231  	JEQ	endofpage
  2232  
  2233  	// We must ignore high bytes as they aren't part of our slice.
  2234  	// Create mask.
  2235  	MOVB BX, CX
  2236  	MOVQ $1, R10
  2237  	SALQ CL, R10
  2238  	SUBQ $1, R10
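        	// R10 = (1<<BX)-1 keeps only the match bits for the BX bytes that
        	// actually belong to the slice (e.g. BX=5 gives 0x1F).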
  2239  
  2240  	// Load data
  2241  	MOVOU	(SI), X1
  2242  	// Compare target byte with each byte in data.
  2243  	PCMPEQB	X0, X1
  2244  	// Move result bits to integer register.
  2245  	PMOVMSKB X1, DX
  2246  	// Apply mask
  2247  	ANDQ R10, DX
  2248  	POPCNTL DX, DX
  2249  	// Return DX directly; we don't need to accumulate
  2250  	// since we have < 16 bytes.
  2251  	MOVQ	DX, (R8)
  2252  	RET
  2253  endzero:
  2254  	MOVQ $0, (R8)
  2255  	RET
  2256  
  2257  endofpage:
  2258  	// We must ignore low bytes as they aren't part of our slice.
  2259  	MOVQ $16,CX
  2260  	SUBQ BX, CX
  2261  	MOVQ $0xFFFF, R10
  2262  	SARQ CL, R10
  2263  	SALQ CL, R10
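        	// Here the mask selects the high BX bits instead: the load below ends at
        	// the last byte of the slice, so the slice occupies the top BX byte
        	// positions of X1 and low-order match bits belong to memory before it.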
  2264  
  2265  	// Load data into the high end of X1.
  2266  	MOVOU	-16(SI)(BX*1), X1
  2267  	// Compare target byte with each byte in data.
  2268  	PCMPEQB	X0, X1
  2269  	// Move result bits to integer register.
  2270  	PMOVMSKB X1, DX
  2271  	// Apply mask
  2272  	ANDQ R10, DX
  2273  	// Return DX directly; we don't need to accumulate
  2274  	// since we have < 16 bytes.
  2275  	POPCNTL DX, DX
  2276  	MOVQ	DX, (R8)
  2277  	RET
  2278  
  2279  avx2:
  2280  	CMPB   runtime·support_avx2(SB), $1
  2281  	JNE sse
  2282  	MOVD AX, X0
  2283  	LEAQ -32(SI)(BX*1), R11
  2284  	VPBROADCASTB  X0, Y1
  2285  avx2_loop:
  2286  	VMOVDQU (DI), Y2
  2287  	VPCMPEQB Y1, Y2, Y3
  2288  	VPMOVMSKB Y3, DX
  2289  	POPCNTL DX, DX
  2290  	ADDQ DX, R12
  2291  	ADDQ $32, DI
  2292  	CMPQ DI, R11
  2293  	JLE avx2_loop
  2294  
  2295  	// If the last block has already been processed,
  2296  	// skip to the end.
  2297  	CMPQ DI, R11
  2298  	JEQ endavx
  2299  
  2300  	// Load address of the last 32 bytes.
  2301  	// There is an overlap with the previous block.
  2302  	MOVQ R11, DI
  2303  	VMOVDQU (DI), Y2
  2304  	VPCMPEQB Y1, Y2, Y3
  2305  	VPMOVMSKB Y3, DX
  2306  	// Exit AVX mode.
  2307  	VZEROUPPER
  2308  
  2309  	// Create a mask to ignore the overlap between the previous 32-byte block
  2310  	// and the next.
  2311  	ANDQ $31, BX
  2312  	MOVQ $32,CX
  2313  	SUBQ BX, CX
  2314  	MOVQ $0xFFFFFFFF, R10
  2315  	SARQ CL, R10
  2316  	SALQ CL, R10
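        	// 32-bit analogue of the 16-byte tail mask: keep only the top BX bits
        	// (BX was reduced mod 32 above), i.e. the bytes of this final 32-byte
        	// window that the AVX2 loop has not already counted.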
  2317  	// Apply mask
  2318  	ANDQ R10, DX
  2319  	POPCNTL DX, DX
  2320  	ADDQ DX, R12
  2321  	MOVQ R12, (R8)
  2322  	RET
  2323  endavx:
  2324  	// Exit AVX mode.
  2325  	VZEROUPPER
  2326  	MOVQ R12, (R8)
  2327  	RET
  2328  
  2329  TEXT runtime·return0(SB), NOSPLIT, $0
  2330  	MOVL	$0, AX
  2331  	RET
  2332  
  2333  
  2334  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  2335  // Must obey the gcc calling convention.
  2336  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  2337  	get_tls(CX)
  2338  	MOVQ	g(CX), AX
  2339  	MOVQ	g_m(AX), AX
  2340  	MOVQ	m_curg(AX), AX
  2341  	MOVQ	(g_stack+stack_hi)(AX), AX
  2342  	RET
  2343  
  2344  // The top-most function running on a goroutine
  2345  // returns to goexit+PCQuantum.
  2346  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  2347  	BYTE	$0x90	// NOP
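        	// The return PC recorded for a goroutine's outermost frame is
        	// goexit+PCQuantum, so this NOP keeps that address inside goexit's
        	// code range for tracebacks.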
  2348  	CALL	runtime·goexit1(SB)	// does not return
  2349  	// traceback from goexit1 must hit code range of goexit
  2350  	BYTE	$0x90	// NOP
  2351  
  2352  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
  2353  	MOVQ	addr+0(FP), AX
  2354  	PREFETCHT0	(AX)
  2355  	RET
  2356  
  2357  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
  2358  	MOVQ	addr+0(FP), AX
  2359  	PREFETCHT1	(AX)
  2360  	RET
  2361  
  2362  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
  2363  	MOVQ	addr+0(FP), AX
  2364  	PREFETCHT2	(AX)
  2365  	RET
  2366  
  2367  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
  2368  	MOVQ	addr+0(FP), AX
  2369  	PREFETCHNTA	(AX)
  2370  	RET
  2371  
  2372  // This is called from .init_array and follows the platform, not Go, ABI.
  2373  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
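        	// DI holds the new module's moduledata pointer (the first argument in
        	// the platform C ABI); append it to the runtime's module list.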
  2374  	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
  2375  	MOVQ	runtime·lastmoduledatap(SB), AX
  2376  	MOVQ	DI, moduledata_next(AX)
  2377  	MOVQ	DI, runtime·lastmoduledatap(SB)
  2378  	POPQ	R15
  2379  	RET