github.com/sanprasirt/go@v0.0.0-20170607001320-a027466e4b6d/src/runtime/asm_amd64.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
     10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
         	// Bootstrap entry, reached from the OS-specific rt0 with
         	// DI=argc, SI=argv (SysV AMD64 convention). Establishes g0's
         	// stack bounds, probes CPU features via CPUID, performs TLS
         	// setup (or lets _cgo_init do it), wires g0<->m0, runs
         	// schedinit, queues runtime.main as the first goroutine, and
         	// starts this M. Never returns.
     11  	// copy arguments forward on an even stack
     12  	MOVQ	DI, AX		// argc
     13  	MOVQ	SI, BX		// argv
     14  	SUBQ	$(4*8+7), SP		// 2args 2auto
     15  	ANDQ	$~15, SP
     16  	MOVQ	AX, 16(SP)
     17  	MOVQ	BX, 24(SP)
     18  	
     19  	// create istack out of the given (operating system) stack.
     20  	// _cgo_init may update stackguard.
     21  	MOVQ	$runtime·g0(SB), DI
     22  	LEAQ	(-64*1024+104)(SP), BX
     23  	MOVQ	BX, g_stackguard0(DI)
     24  	MOVQ	BX, g_stackguard1(DI)
     25  	MOVQ	BX, (g_stack+stack_lo)(DI)
     26  	MOVQ	SP, (g_stack+stack_hi)(DI)
     27  
     28  	// find out information about the processor we're on
     29  	MOVL	$0, AX
     30  	CPUID
         	// SI = maximum supported CPUID leaf; checked before leaf 7 below.
     31  	MOVL	AX, SI
     32  	CMPL	AX, $0
     33  	JE	nocpuinfo
     34  
     35  	// Figure out how to serialize RDTSC.
     36  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
     37  	// Don't know about the rest, so let's do MFENCE.
     38  	CMPL	BX, $0x756E6547  // "Genu"
     39  	JNE	notintel
     40  	CMPL	DX, $0x49656E69  // "ineI"
     41  	JNE	notintel
     42  	CMPL	CX, $0x6C65746E  // "ntel"
     43  	JNE	notintel
     44  	MOVB	$1, runtime·isIntel(SB)
     45  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
     46  notintel:
     47  
     48  	// Load EAX=1 cpuid flags
     49  	MOVL	$1, AX
     50  	CPUID
     51  	MOVL	AX, runtime·processorVersionInfo(SB)
     52  
     53  	TESTL	$(1<<26), DX // SSE2
     54  	SETNE	runtime·support_sse2(SB)
     55  
     56  	TESTL	$(1<<9), CX // SSSE3
     57  	SETNE	runtime·support_ssse3(SB)
     58  
     59  	TESTL	$(1<<19), CX // SSE4.1
     60  	SETNE	runtime·support_sse41(SB)
     61  
     62  	TESTL	$(1<<20), CX // SSE4.2
     63  	SETNE	runtime·support_sse42(SB)
     64  
     65  	TESTL	$(1<<23), CX // POPCNT
     66  	SETNE	runtime·support_popcnt(SB)
     67  
     68  	TESTL	$(1<<25), CX // AES
     69  	SETNE	runtime·support_aes(SB)
     70  
     71  	TESTL	$(1<<27), CX // OSXSAVE
     72  	SETNE	runtime·support_osxsave(SB)
     73  
     74  	// If OS support for XMM and YMM is not present
     75  	// support_avx will be set back to false later.
     76  	TESTL	$(1<<28), CX // AVX
     77  	SETNE	runtime·support_avx(SB)
     78  
     79  eax7:
     80  	// Load EAX=7/ECX=0 cpuid flags
         	// Skip leaf 7 if the CPU's maximum leaf (SI) is below 7.
     81  	CMPL	SI, $7
     82  	JLT	osavx
     83  	MOVL	$7, AX
     84  	MOVL	$0, CX
     85  	CPUID
     86  
     87  	TESTL	$(1<<3), BX // BMI1
     88  	SETNE	runtime·support_bmi1(SB)
     89  
     90  	// If OS support for XMM and YMM is not present
     91  	// support_avx2 will be set back to false later.
     92  	TESTL	$(1<<5), BX
     93  	SETNE	runtime·support_avx2(SB)
     94  
     95  	TESTL	$(1<<8), BX // BMI2
     96  	SETNE	runtime·support_bmi2(SB)
     97  
     98  	TESTL	$(1<<9), BX // ERMS
     99  	SETNE	runtime·support_erms(SB)
    100  
    101  osavx:
    102  	CMPB	runtime·support_osxsave(SB), $1
    103  	JNE	noavx
    104  	MOVL	$0, CX
    105  	// For XGETBV, OSXSAVE bit is required and sufficient
    106  	XGETBV
         	// XCR0 bits 1 (XMM) and 2 (YMM) must both be set by the OS.
    107  	ANDL	$6, AX
    108  	CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
    109  	JE nocpuinfo
    110  noavx:
    111  	MOVB $0, runtime·support_avx(SB)
    112  	MOVB $0, runtime·support_avx2(SB)
    113  
    114  nocpuinfo:
    115  	// if there is an _cgo_init, call it.
    116  	MOVQ	_cgo_init(SB), AX
    117  	TESTQ	AX, AX
    118  	JZ	needtls
    119  	// g0 already in DI
    120  	MOVQ	DI, CX	// Win64 uses CX for first parameter
    121  	MOVQ	$setg_gcc<>(SB), SI
    122  	CALL	AX
    123  
    124  	// update stackguard after _cgo_init
    125  	MOVQ	$runtime·g0(SB), CX
    126  	MOVQ	(g_stack+stack_lo)(CX), AX
    127  	ADDQ	$const__StackGuard, AX
    128  	MOVQ	AX, g_stackguard0(CX)
    129  	MOVQ	AX, g_stackguard1(CX)
    130  
    131  #ifndef GOOS_windows
    132  	JMP ok
    133  #endif
    134  needtls:
    135  #ifdef GOOS_plan9
    136  	// skip TLS setup on Plan 9
    137  	JMP ok
    138  #endif
    139  #ifdef GOOS_solaris
    140  	// skip TLS setup on Solaris
    141  	JMP ok
    142  #endif
    143  
    144  	LEAQ	runtime·m0+m_tls(SB), DI
    145  	CALL	runtime·settls(SB)
    146  
    147  	// store through it, to make sure it works
         	// Write a sentinel via the TLS slot, read it back directly from
         	// m0.tls; mismatch means settls failed, so crash immediately.
    148  	get_tls(BX)
    149  	MOVQ	$0x123, g(BX)
    150  	MOVQ	runtime·m0+m_tls(SB), AX
    151  	CMPQ	AX, $0x123
    152  	JEQ 2(PC)
    153  	MOVL	AX, 0	// abort
    154  ok:
    155  	// set the per-goroutine and per-mach "registers"
    156  	get_tls(BX)
    157  	LEAQ	runtime·g0(SB), CX
    158  	MOVQ	CX, g(BX)
    159  	LEAQ	runtime·m0(SB), AX
    160  
    161  	// save m->g0 = g0
    162  	MOVQ	CX, m_g0(AX)
    163  	// save m0 to g0->m
    164  	MOVQ	AX, g_m(CX)
    165  
    166  	CLD				// convention is D is always left cleared
    167  	CALL	runtime·check(SB)
    168  
    169  	MOVL	16(SP), AX		// copy argc
    170  	MOVL	AX, 0(SP)
    171  	MOVQ	24(SP), AX		// copy argv
    172  	MOVQ	AX, 8(SP)
    173  	CALL	runtime·args(SB)
    174  	CALL	runtime·osinit(SB)
    175  	CALL	runtime·schedinit(SB)
    176  
    177  	// create a new goroutine to start program
    178  	MOVQ	$runtime·mainPC(SB), AX		// entry
    179  	PUSHQ	AX
    180  	PUSHQ	$0			// arg size
    181  	CALL	runtime·newproc(SB)
    182  	POPQ	AX
    183  	POPQ	AX
    184  
    185  	// start this M
    186  	CALL	runtime·mstart(SB)
    187  
         	// mstart never returns; this deliberate fault catches it if it does.
    188  	MOVL	$0xf1, 0xf1  // crash
    189  	RET
   190  
    191  DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)	// mainPC: a funcval holding &runtime.main
    192  GLOBL	runtime·mainPC(SB),RODATA,$8	// read-only 8-byte symbol, used by rt0_go as newproc's entry
   193  
    194  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
    195  	BYTE	$0xcc	// 0xCC = INT3, the canonical one-byte debugger breakpoint
    196  	RET
   197  
    198  TEXT runtime·asminit(SB),NOSPLIT,$0-0
    199  	// No per-thread init.
         	// amd64 needs no per-thread assembly setup; other arches use this hook.
    200  	RET
   201  
   202  /*
   203   *  go-routine
   204   */
   205  
   206  // void gosave(Gobuf*)
   207  // save state in Gobuf; setjmp
    206  // void gosave(Gobuf*)
    207  // save state in Gobuf; setjmp
    208  TEXT runtime·gosave(SB), NOSPLIT, $0-8
         	// Records the CALLER's PC/SP (not gosave's own frame) plus BP and g
         	// into the Gobuf, so a later gogo resumes at the caller's return point.
    209  	MOVQ	buf+0(FP), AX		// gobuf
    210  	LEAQ	buf+0(FP), BX		// caller's SP
    211  	MOVQ	BX, gobuf_sp(AX)
    212  	MOVQ	0(SP), BX		// caller's PC
    213  	MOVQ	BX, gobuf_pc(AX)
    214  	MOVQ	$0, gobuf_ret(AX)
    215  	MOVQ	BP, gobuf_bp(AX)
    216  	// Assert ctxt is zero. See func save.
    217  	MOVQ	gobuf_ctxt(AX), BX
    218  	TESTQ	BX, BX
    219  	JZ	2(PC)
    220  	CALL	runtime·badctxt(SB)
    221  	get_tls(CX)
    222  	MOVQ	g(CX), BX
    223  	MOVQ	BX, gobuf_g(AX)
    224  	RET
   225  
   226  // void gogo(Gobuf*)
   227  // restore state from Gobuf; longjmp
    226  // void gogo(Gobuf*)
    227  // restore state from Gobuf; longjmp
    228  TEXT runtime·gogo(SB), NOSPLIT, $16-8
    229  	MOVQ	buf+0(FP), BX		// gobuf
    230  
    231  	// If ctxt is not nil, invoke deletion barrier before overwriting.
    232  	MOVQ	gobuf_ctxt(BX), AX
    233  	TESTQ	AX, AX
    234  	JZ	nilctxt
    235  	LEAQ	gobuf_ctxt(BX), AX
    236  	MOVQ	AX, 0(SP)
    237  	MOVQ	$0, 8(SP)
    238  	CALL	runtime·writebarrierptr_prewrite(SB)
         	// The call above may have moved the stack; reload the gobuf pointer.
    239  	MOVQ	buf+0(FP), BX
    240  
    241  nilctxt:
    242  	MOVQ	gobuf_g(BX), DX
    243  	MOVQ	0(DX), CX		// make sure g != nil
         	// The load above is only a nil-check probe; CX is immediately reused.
    244  	get_tls(CX)
    245  	MOVQ	DX, g(CX)
    246  	MOVQ	gobuf_sp(BX), SP	// restore SP
    247  	MOVQ	gobuf_ret(BX), AX
    248  	MOVQ	gobuf_ctxt(BX), DX
    249  	MOVQ	gobuf_bp(BX), BP
    250  	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
    251  	MOVQ	$0, gobuf_ret(BX)
    252  	MOVQ	$0, gobuf_ctxt(BX)
    253  	MOVQ	$0, gobuf_bp(BX)
    254  	MOVQ	gobuf_pc(BX), BX
    255  	JMP	BX
   256  
   257  // func mcall(fn func(*g))
   258  // Switch to m->g0's stack, call fn(g).
   259  // Fn must never return. It should gogo(&g->sched)
   260  // to keep running g.
    261  TEXT runtime·mcall(SB), NOSPLIT, $0-8
    262  	MOVQ	fn+0(FP), DI
    263  	
    264  	get_tls(CX)
    265  	MOVQ	g(CX), AX	// save state in g->sched
    266  	MOVQ	0(SP), BX	// caller's PC
    267  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
    268  	LEAQ	fn+0(FP), BX	// caller's SP
    269  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
    270  	MOVQ	AX, (g_sched+gobuf_g)(AX)
    271  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
    272  
    273  	// switch to m->g0 & its stack, call fn
    274  	MOVQ	g(CX), BX
    275  	MOVQ	g_m(BX), BX
    276  	MOVQ	m_g0(BX), SI
    277  	CMPQ	SI, AX	// if g == m->g0 call badmcall
    278  	JNE	3(PC)
    279  	MOVQ	$runtime·badmcall(SB), AX
    280  	JMP	AX
    281  	MOVQ	SI, g(CX)	// g = m->g0
    282  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
         	// Push the old g as fn's argument; DX carries the closure context.
    283  	PUSHQ	AX
    284  	MOVQ	DI, DX
    285  	MOVQ	0(DI), DI
    286  	CALL	DI
         	// fn must not return; falling through here is a fatal error.
    287  	POPQ	AX
    288  	MOVQ	$runtime·badmcall2(SB), AX
    289  	JMP	AX
    290  	RET
   291  
   292  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   293  // of the G stack. We need to distinguish the routine that
   294  // lives at the bottom of the G stack from the one that lives
   295  // at the top of the system stack because the one at the top of
   296  // the system stack terminates the stack walk (see topofstack()).
    297  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
         	// Deliberately empty: only its PC matters, as a traceback marker.
    298  	RET
   299  
   300  // func systemstack(fn func())
    300  // func systemstack(fn func())
    301  TEXT runtime·systemstack(SB), NOSPLIT, $0-8
    302  	MOVQ	fn+0(FP), DI	// DI = fn
    303  	get_tls(CX)
    304  	MOVQ	g(CX), AX	// AX = g
    305  	MOVQ	g_m(AX), BX	// BX = m
    306  
         	// Already on the signal stack or g0? Then no switch is needed.
    307  	MOVQ	m_gsignal(BX), DX	// DX = gsignal
    308  	CMPQ	AX, DX
    309  	JEQ	noswitch
    310  
    311  	MOVQ	m_g0(BX), DX	// DX = g0
    312  	CMPQ	AX, DX
    313  	JEQ	noswitch
    314  
    315  	MOVQ	m_curg(BX), R8
    316  	CMPQ	AX, R8
    317  	JEQ	switch
    318  	
    319  	// Bad: g is not gsignal, not g0, not curg. What is it?
    320  	MOVQ	$runtime·badsystemstack(SB), AX
    321  	CALL	AX
    322  
    323  switch:
    324  	// save our state in g->sched. Pretend to
    325  	// be systemstack_switch if the G stack is scanned.
    326  	MOVQ	$runtime·systemstack_switch(SB), SI
    327  	MOVQ	SI, (g_sched+gobuf_pc)(AX)
    328  	MOVQ	SP, (g_sched+gobuf_sp)(AX)
    329  	MOVQ	AX, (g_sched+gobuf_g)(AX)
    330  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
    331  
    332  	// switch to g0
    333  	MOVQ	DX, g(CX)
    334  	MOVQ	(g_sched+gobuf_sp)(DX), BX
    335  	// make it look like mstart called systemstack on g0, to stop traceback
    336  	SUBQ	$8, BX
    337  	MOVQ	$runtime·mstart(SB), DX
    338  	MOVQ	DX, 0(BX)
    339  	MOVQ	BX, SP
    340  
    341  	// call target function
         	// DX = closure context, DI = code pointer loaded from the funcval.
    342  	MOVQ	DI, DX
    343  	MOVQ	0(DI), DI
    344  	CALL	DI
    345  
    346  	// switch back to g
    347  	get_tls(CX)
    348  	MOVQ	g(CX), AX
    349  	MOVQ	g_m(AX), BX
    350  	MOVQ	m_curg(BX), AX
    351  	MOVQ	AX, g(CX)
    352  	MOVQ	(g_sched+gobuf_sp)(AX), SP
    353  	MOVQ	$0, (g_sched+gobuf_sp)(AX)
    354  	RET
    355  
    356  noswitch:
    357  	// already on m stack, just call directly
    358  	MOVQ	DI, DX
    359  	MOVQ	0(DI), DI
    360  	CALL	DI
    361  	RET
   362  
   363  /*
   364   * support for morestack
   365   */
   366  
   367  // Called during function prolog when more stack is needed.
   368  //
   369  // The traceback routines see morestack on a g0 as being
   370  // the top of a stack (for example, morestack calling newstack
   371  // calling the scheduler calling newm calling gc), so we must
   372  // record an argument size. For that purpose, it has no arguments.
    373  TEXT runtime·morestack(SB),NOSPLIT,$0-0
         	// On entry (set up by the function prologue): 0(SP) = f's PC,
         	// 8(SP) = f's caller's PC, DX = f's closure context.
    374  	// Cannot grow scheduler stack (m->g0).
    375  	get_tls(CX)
    376  	MOVQ	g(CX), BX
    377  	MOVQ	g_m(BX), BX
    378  	MOVQ	m_g0(BX), SI
    379  	CMPQ	g(CX), SI
    380  	JNE	3(PC)
    381  	CALL	runtime·badmorestackg0(SB)
    382  	INT	$3
    383  
    384  	// Cannot grow signal stack (m->gsignal).
    385  	MOVQ	m_gsignal(BX), SI
    386  	CMPQ	g(CX), SI
    387  	JNE	3(PC)
    388  	CALL	runtime·badmorestackgsignal(SB)
    389  	INT	$3
    390  
    391  	// Called from f.
    392  	// Set m->morebuf to f's caller.
    393  	MOVQ	8(SP), AX	// f's caller's PC
    394  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
    395  	LEAQ	16(SP), AX	// f's caller's SP
    396  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
    397  	get_tls(CX)
    398  	MOVQ	g(CX), SI
    399  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
    400  
    401  	// Set g->sched to context in f.
    402  	MOVQ	0(SP), AX // f's PC
    403  	MOVQ	AX, (g_sched+gobuf_pc)(SI)
    404  	MOVQ	SI, (g_sched+gobuf_g)(SI)
    405  	LEAQ	8(SP), AX // f's SP
    406  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
    407  	MOVQ	BP, (g_sched+gobuf_bp)(SI)
    408  	// newstack will fill gobuf.ctxt.
    409  
    410  	// Call newstack on m->g0's stack.
    411  	MOVQ	m_g0(BX), BX
    412  	MOVQ	BX, g(CX)
    413  	MOVQ	(g_sched+gobuf_sp)(BX), SP
    414  	PUSHQ	DX	// ctxt argument
    415  	CALL	runtime·newstack(SB)
    416  	MOVQ	$0, 0x1003	// crash if newstack returns
    417  	POPQ	DX	// keep balance check happy
    418  	RET
   419  
   420  // morestack but not preserving ctxt.
    420  // morestack but not preserving ctxt.
    421  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
         	// Zero DX (the closure-context register) before tail-jumping.
    422  	MOVL	$0, DX
    423  	JMP	runtime·morestack(SB)
   424  
   425  // reflectcall: call a function with the given argument list
   426  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   427  // we don't have variable-sized frames, so we use a small number
   428  // of constant-sized-frame functions to encode a few bits of size in the pc.
   429  // Caution: ugly multiline assembly macros in your future!
   430  
         // DISPATCH jumps to NAME if the argument size in CX is <= MAXSIZE
         // (unsigned compare); otherwise falls through to the next DISPATCH.
    431  #define DISPATCH(NAME,MAXSIZE)		\
    432  	CMPQ	CX, $MAXSIZE;		\
    433  	JA	3(PC);			\
    434  	MOVQ	$NAME(SB), AX;		\
    435  	JMP	AX
    436  // Note: can't just "JMP NAME(SB)" - bad inlining results.
   437  
    438  TEXT reflect·call(SB), NOSPLIT, $0-0
         	// Thin forwarder: reflect.call simply tail-jumps to runtime.reflectcall.
    439  	JMP	·reflectcall(SB)
   440  
    441  TEXT ·reflectcall(SB), NOSPLIT, $0-32
         	// CX = argsize (zero-extended); dispatch to the smallest fixed-frame
         	// call* variant whose frame can hold the arguments.
    442  	MOVLQZX argsize+24(FP), CX
    443  	DISPATCH(runtime·call32, 32)
    444  	DISPATCH(runtime·call64, 64)
    445  	DISPATCH(runtime·call128, 128)
    446  	DISPATCH(runtime·call256, 256)
    447  	DISPATCH(runtime·call512, 512)
    448  	DISPATCH(runtime·call1024, 1024)
    449  	DISPATCH(runtime·call2048, 2048)
    450  	DISPATCH(runtime·call4096, 4096)
    451  	DISPATCH(runtime·call8192, 8192)
    452  	DISPATCH(runtime·call16384, 16384)
    453  	DISPATCH(runtime·call32768, 32768)
    454  	DISPATCH(runtime·call65536, 65536)
    455  	DISPATCH(runtime·call131072, 131072)
    456  	DISPATCH(runtime·call262144, 262144)
    457  	DISPATCH(runtime·call524288, 524288)
    458  	DISPATCH(runtime·call1048576, 1048576)
    459  	DISPATCH(runtime·call2097152, 2097152)
    460  	DISPATCH(runtime·call4194304, 4194304)
    461  	DISPATCH(runtime·call8388608, 8388608)
    462  	DISPATCH(runtime·call16777216, 16777216)
    463  	DISPATCH(runtime·call33554432, 33554432)
    464  	DISPATCH(runtime·call67108864, 67108864)
    465  	DISPATCH(runtime·call134217728, 134217728)
    466  	DISPATCH(runtime·call268435456, 268435456)
    467  	DISPATCH(runtime·call536870912, 536870912)
    468  	DISPATCH(runtime·call1073741824, 1073741824)
         	// argsize > 1 GB: no variant fits; report the bad call.
    469  	MOVQ	$runtime·badreflectcall(SB), AX
    470  	JMP	AX
   471  
         // CALLFN defines one fixed-frame reflectcall variant: it copies the
         // argument block onto its own frame, calls f with DX as closure
         // context, then hands off to callRet<> to copy results back through
         // the write barrier.
    472  #define CALLFN(NAME,MAXSIZE)			\
    473  TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
    474  	NO_LOCAL_POINTERS;			\
    475  	/* copy arguments to stack */		\
    476  	MOVQ	argptr+16(FP), SI;		\
    477  	MOVLQZX argsize+24(FP), CX;		\
    478  	MOVQ	SP, DI;				\
    479  	REP;MOVSB;				\
    480  	/* call function */			\
    481  	MOVQ	f+8(FP), DX;			\
    482  	PCDATA  $PCDATA_StackMapIndex, $0;	\
    483  	CALL	(DX);				\
    484  	/* copy return values back */		\
    485  	MOVQ	argtype+0(FP), DX;		\
    486  	MOVQ	argptr+16(FP), DI;		\
    487  	MOVLQZX	argsize+24(FP), CX;		\
    488  	MOVLQZX	retoffset+28(FP), BX;		\
    489  	MOVQ	SP, SI;				\
    490  	ADDQ	BX, DI;				\
    491  	ADDQ	BX, SI;				\
    492  	SUBQ	BX, CX;				\
    493  	CALL	callRet<>(SB);			\
    494  	RET
   495  
   496  // callRet copies return values back at the end of call*. This is a
   497  // separate function so it can allocate stack space for the arguments
   498  // to reflectcallmove. It does not follow the Go ABI; it expects its
   499  // arguments in registers.
    500  TEXT callRet<>(SB), NOSPLIT, $32-0
    501  	NO_LOCAL_POINTERS
         	// Marshal register args (DX=argtype, DI=dst, SI=src, CX=size)
         	// into the Go call frame for reflectcallmove.
    502  	MOVQ	DX, 0(SP)
    503  	MOVQ	DI, 8(SP)
    504  	MOVQ	SI, 16(SP)
    505  	MOVQ	CX, 24(SP)
    506  	CALL	runtime·reflectcallmove(SB)
    507  	RET
   508  
    509  CALLFN(·call32, 32)	// 26 power-of-two frame sizes, 32 B .. 1 GB
    510  CALLFN(·call64, 64)
    511  CALLFN(·call128, 128)
    512  CALLFN(·call256, 256)
    513  CALLFN(·call512, 512)
    514  CALLFN(·call1024, 1024)
    515  CALLFN(·call2048, 2048)
    516  CALLFN(·call4096, 4096)
    517  CALLFN(·call8192, 8192)
    518  CALLFN(·call16384, 16384)
    519  CALLFN(·call32768, 32768)
    520  CALLFN(·call65536, 65536)
    521  CALLFN(·call131072, 131072)
    522  CALLFN(·call262144, 262144)
    523  CALLFN(·call524288, 524288)
    524  CALLFN(·call1048576, 1048576)
    525  CALLFN(·call2097152, 2097152)
    526  CALLFN(·call4194304, 4194304)
    527  CALLFN(·call8388608, 8388608)
    528  CALLFN(·call16777216, 16777216)
    529  CALLFN(·call33554432, 33554432)
    530  CALLFN(·call67108864, 67108864)
    531  CALLFN(·call134217728, 134217728)
    532  CALLFN(·call268435456, 268435456)
    533  CALLFN(·call536870912, 536870912)
    534  CALLFN(·call1073741824, 1073741824)
   535  
    536  TEXT runtime·procyield(SB),NOSPLIT,$0-0
         	// Spin for `cycles` iterations of PAUSE, a spin-wait hint that
         	// reduces power and pipeline pressure in busy-wait loops.
    537  	MOVL	cycles+0(FP), AX
    538  again:
    539  	PAUSE
    540  	SUBL	$1, AX
    541  	JNZ	again
    542  	RET
   543  
   544  
    545  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
    546  	// Stores are already ordered on x86, so this is just a
    547  	// compile barrier.
    548  	RET
   549  
   550  // void jmpdefer(fn, sp);
   551  // called from deferreturn.
   552  // 1. pop the caller
   553  // 2. sub 5 bytes from the callers return
   554  // 3. jmp to the argument
    555  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
    556  	MOVQ	fv+0(FP), DX	// fn
    557  	MOVQ	argp+8(FP), BX	// caller sp
    558  	LEAQ	-8(BX), SP	// caller sp after CALL
    559  	MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
         	// Rewind the return address by the 5-byte CALL encoding so the
         	// caller re-executes its CALL to deferreturn after fn runs.
    560  	SUBQ	$5, (SP)	// return to CALL again
    561  	MOVQ	0(DX), BX
    562  	JMP	BX	// but first run the deferred function
   563  
   564  // Save state of caller into g->sched. Smashes R8, R9.
    565  TEXT gosave<>(SB),NOSPLIT,$0
         	// Internal helper for asmcgocall: snapshot the caller's PC/SP/BP
         	// into g->sched using only R8/R9 as scratch.
    566  	get_tls(R8)
    567  	MOVQ	g(R8), R8
    568  	MOVQ	0(SP), R9
    569  	MOVQ	R9, (g_sched+gobuf_pc)(R8)
    570  	LEAQ	8(SP), R9
    571  	MOVQ	R9, (g_sched+gobuf_sp)(R8)
    572  	MOVQ	$0, (g_sched+gobuf_ret)(R8)
    573  	MOVQ	BP, (g_sched+gobuf_bp)(R8)
    574  	// Assert ctxt is zero. See func save.
    575  	MOVQ	(g_sched+gobuf_ctxt)(R8), R9
    576  	TESTQ	R9, R9
    577  	JZ	2(PC)
    578  	CALL	runtime·badctxt(SB)
    579  	RET
   580  
   581  // func asmcgocall(fn, arg unsafe.Pointer) int32
   582  // Call fn(arg) on the scheduler stack,
   583  // aligned appropriately for the gcc ABI.
   584  // See cgocall.go for more details.
    585  TEXT ·asmcgocall(SB),NOSPLIT,$0-20
    586  	MOVQ	fn+0(FP), AX
    587  	MOVQ	arg+8(FP), BX
    588  
         	// DX = SP on entry; used below to record stack depth relative to
         	// stack.hi so the value survives a stack copy during a callback.
    589  	MOVQ	SP, DX
    590  
    591  	// Figure out if we need to switch to m->g0 stack.
    592  	// We get called to create new OS threads too, and those
    593  	// come in on the m->g0 stack already.
    594  	get_tls(CX)
    595  	MOVQ	g(CX), R8
    596  	CMPQ	R8, $0
    597  	JEQ	nosave
    598  	MOVQ	g_m(R8), R8
    599  	MOVQ	m_g0(R8), SI
    600  	MOVQ	g(CX), DI
    601  	CMPQ	SI, DI
    602  	JEQ	nosave
    603  	MOVQ	m_gsignal(R8), SI
    604  	CMPQ	SI, DI
    605  	JEQ	nosave
    606  	
    607  	// Switch to system stack.
    608  	MOVQ	m_g0(R8), SI
    609  	CALL	gosave<>(SB)
    610  	MOVQ	SI, g(CX)
    611  	MOVQ	(g_sched+gobuf_sp)(SI), SP
    612  
    613  	// Now on a scheduling stack (a pthread-created stack).
    614  	// Make sure we have enough room for 4 stack-backed fast-call
    615  	// registers as per windows amd64 calling convention.
    616  	SUBQ	$64, SP
    617  	ANDQ	$~15, SP	// alignment for gcc ABI
    618  	MOVQ	DI, 48(SP)	// save g
    619  	MOVQ	(g_stack+stack_hi)(DI), DI
    620  	SUBQ	DX, DI
    621  	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
    622  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
    623  	MOVQ	BX, CX		// CX = first argument in Win64
    624  	CALL	AX
    625  
    626  	// Restore registers, g, stack pointer.
    627  	get_tls(CX)
    628  	MOVQ	48(SP), DI
    629  	MOVQ	(g_stack+stack_hi)(DI), SI
    630  	SUBQ	40(SP), SI
    631  	MOVQ	DI, g(CX)
    632  	MOVQ	SI, SP
    633  
         	// C return value (32-bit) comes back in AX.
    634  	MOVL	AX, ret+16(FP)
    635  	RET
    636  
    637  nosave:
    638  	// Running on a system stack, perhaps even without a g.
    639  	// Having no g can happen during thread creation or thread teardown
    640  	// (see needm/dropm on Solaris, for example).
    641  	// This code is like the above sequence but without saving/restoring g
    642  	// and without worrying about the stack moving out from under us
    643  	// (because we're on a system stack, not a goroutine stack).
    644  	// The above code could be used directly if already on a system stack,
    645  	// but then the only path through this code would be a rare case on Solaris.
    646  	// Using this code for all "already on system stack" calls exercises it more,
    647  	// which should help keep it correct.
    648  	SUBQ	$64, SP
    649  	ANDQ	$~15, SP
    650  	MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
    651  	MOVQ	DX, 40(SP)	// save original stack pointer
    652  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
    653  	MOVQ	BX, CX		// CX = first argument in Win64
    654  	CALL	AX
    655  	MOVQ	40(SP), SI	// restore original stack pointer
    656  	MOVQ	SI, SP
    657  	MOVL	AX, ret+16(FP)
    658  	RET
   659  
   660  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
   661  // Turn the fn into a Go func (by taking its address) and call
   662  // cgocallback_gofunc.
    663  TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
         	// Take the address of the fn argument slot so it acts as a *FuncVal,
         	// then forward all four arguments to cgocallback_gofunc.
    664  	LEAQ	fn+0(FP), AX
    665  	MOVQ	AX, 0(SP)
    666  	MOVQ	frame+8(FP), AX
    667  	MOVQ	AX, 8(SP)
    668  	MOVQ	framesize+16(FP), AX
    669  	MOVQ	AX, 16(SP)
    670  	MOVQ	ctxt+24(FP), AX
    671  	MOVQ	AX, 24(SP)
    672  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
    673  	CALL	AX
    674  	RET
   675  
   676  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
   677  // See cgocall.go for more details.
    678  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
    679  	NO_LOCAL_POINTERS
    680  
    681  	// If g is nil, Go did not create the current thread.
    682  	// Call needm to obtain one m for temporary use.
    683  	// In this case, we're running on the thread stack, so there's
    684  	// lots of space, but the linker doesn't know. Hide the call from
    685  	// the linker analysis by using an indirect call through AX.
    686  	get_tls(CX)
    687  #ifdef GOOS_windows
         	// On Windows, TLS itself may not be set up yet; treat CX==0 as "no g".
    688  	MOVL	$0, BX
    689  	CMPQ	CX, $0
    690  	JEQ	2(PC)
    691  #endif
    692  	MOVQ	g(CX), BX
    693  	CMPQ	BX, $0
    694  	JEQ	needm
    695  	MOVQ	g_m(BX), BX
    696  	MOVQ	BX, R8 // holds oldm until end of function
    697  	JMP	havem
    698  needm:
         	// R8 = 0 marks "m was borrowed"; checked at the end to call dropm.
    699  	MOVQ	$0, 0(SP)
    700  	MOVQ	$runtime·needm(SB), AX
    701  	CALL	AX
    702  	MOVQ	0(SP), R8
    703  	get_tls(CX)
    704  	MOVQ	g(CX), BX
    705  	MOVQ	g_m(BX), BX
    706  	
    707  	// Set m->sched.sp = SP, so that if a panic happens
    708  	// during the function we are about to execute, it will
    709  	// have a valid SP to run on the g0 stack.
    710  	// The next few lines (after the havem label)
    711  	// will save this SP onto the stack and then write
    712  	// the same SP back to m->sched.sp. That seems redundant,
    713  	// but if an unrecovered panic happens, unwindm will
    714  	// restore the g->sched.sp from the stack location
    715  	// and then systemstack will try to use it. If we don't set it here,
    716  	// that restored SP will be uninitialized (typically 0) and
    717  	// will not be usable.
    718  	MOVQ	m_g0(BX), SI
    719  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
    720  
    721  havem:
    722  	// Now there's a valid m, and we're running on its m->g0.
    723  	// Save current m->g0->sched.sp on stack and then set it to SP.
    724  	// Save current sp in m->g0->sched.sp in preparation for
    725  	// switch back to m->curg stack.
    726  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
    727  	MOVQ	m_g0(BX), SI
    728  	MOVQ	(g_sched+gobuf_sp)(SI), AX
    729  	MOVQ	AX, 0(SP)
    730  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
    731  
    732  	// Switch to m->curg stack and call runtime.cgocallbackg.
    733  	// Because we are taking over the execution of m->curg
    734  	// but *not* resuming what had been running, we need to
    735  	// save that information (m->curg->sched) so we can restore it.
    736  	// We can restore m->curg->sched.sp easily, because calling
    737  	// runtime.cgocallbackg leaves SP unchanged upon return.
    738  	// To save m->curg->sched.pc, we push it onto the stack.
    739  	// This has the added benefit that it looks to the traceback
    740  	// routine like cgocallbackg is going to return to that
    741  	// PC (because the frame we allocate below has the same
    742  	// size as cgocallback_gofunc's frame declared above)
    743  	// so that the traceback will seamlessly trace back into
    744  	// the earlier calls.
    745  	//
    746  	// In the new goroutine, 8(SP) holds the saved R8.
    747  	MOVQ	m_curg(BX), SI
    748  	MOVQ	SI, g(CX)
    749  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
    750  	MOVQ	(g_sched+gobuf_pc)(SI), BX
    751  	MOVQ	BX, -8(DI)
    752  	// Compute the size of the frame, including return PC and, if
    753  	// GOEXPERIMENT=framepointer, the saved base pointer
    754  	MOVQ	ctxt+24(FP), BX
    755  	LEAQ	fv+0(FP), AX
    756  	SUBQ	SP, AX
    757  	SUBQ	AX, DI
    758  	MOVQ	DI, SP
    759  
    760  	MOVQ	R8, 8(SP)
    761  	MOVQ	BX, 0(SP)
    762  	CALL	runtime·cgocallbackg(SB)
    763  	MOVQ	8(SP), R8
    764  
    765  	// Compute the size of the frame again. FP and SP have
    766  	// completely different values here than they did above,
    767  	// but only their difference matters.
    768  	LEAQ	fv+0(FP), AX
    769  	SUBQ	SP, AX
    770  
    771  	// Restore g->sched (== m->curg->sched) from saved values.
    772  	get_tls(CX)
    773  	MOVQ	g(CX), SI
    774  	MOVQ	SP, DI
    775  	ADDQ	AX, DI
    776  	MOVQ	-8(DI), BX
    777  	MOVQ	BX, (g_sched+gobuf_pc)(SI)
    778  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
    779  
    780  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
    781  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
    782  	// so we do not have to restore it.)
    783  	MOVQ	g(CX), BX
    784  	MOVQ	g_m(BX), BX
    785  	MOVQ	m_g0(BX), SI
    786  	MOVQ	SI, g(CX)
    787  	MOVQ	(g_sched+gobuf_sp)(SI), SP
    788  	MOVQ	0(SP), AX
    789  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
    790  	
    791  	// If the m on entry was nil, we called needm above to borrow an m
    792  	// for the duration of the call. Since the call is over, return it with dropm.
    793  	CMPQ	R8, $0
    794  	JNE 3(PC)
    795  	MOVQ	$runtime·dropm(SB), AX
    796  	CALL	AX
    797  
    798  	// Done!
    799  	RET
   800  
   801  // void setg(G*); set g. for use by needm.
    802  TEXT runtime·setg(SB), NOSPLIT, $0-8
    803  	MOVQ	gg+0(FP), BX
    804  #ifdef GOOS_windows
         	// On Windows, also keep the TLS slot at 0x28(GS) in sync:
         	// nil g clears it, otherwise point it at the m's tls array.
    805  	CMPQ	BX, $0
    806  	JNE	settls
    807  	MOVQ	$0, 0x28(GS)
    808  	RET
    809  settls:
    810  	MOVQ	g_m(BX), AX
    811  	LEAQ	m_tls(AX), AX
    812  	MOVQ	AX, 0x28(GS)
    813  #endif
    814  	get_tls(CX)
    815  	MOVQ	BX, g(CX)
    816  	RET
   817  
   818  // void setg_gcc(G*); set g called from gcc.
    819  TEXT setg_gcc<>(SB),NOSPLIT,$0
         	// Called from C (gcc): new g arrives in DI per the SysV C ABI.
    820  	get_tls(AX)
    821  	MOVQ	DI, g(AX)
    822  	RET
   823  
   824  // check that SP is in range [g->stack.lo, g->stack.hi)
    825  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
         	// Debug assertion: trap (INT3) unless stack.lo < SP <= stack.hi.
    826  	get_tls(CX)
    827  	MOVQ	g(CX), AX
    828  	CMPQ	(g_stack+stack_hi)(AX), SP
    829  	JHI	2(PC)
    830  	INT	$3
    831  	CMPQ	SP, (g_stack+stack_lo)(AX)
    832  	JHI	2(PC)
    833  	INT	$3
    834  	RET
   835  
    836  TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
    837  	MOVQ	argp+0(FP),AX		// addr of first arg
         	// The return PC sits one word below the caller's argument area.
    838  	MOVQ	-8(AX),AX		// get calling pc
    839  	MOVQ	AX, ret+8(FP)
    840  	RET
   841  
   842  // func cputicks() int64
    843  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
         	// Serialize before RDTSC: LFENCE on Intel (per the flag set in
         	// rt0_go's CPUID vendor check), MFENCE otherwise.
    844  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
    845  	JNE	mfence
    846  	LFENCE
    847  	JMP	done
    848  mfence:
    849  	MFENCE
    850  done:
    851  	RDTSC
         	// RDTSC returns the counter split across EDX:EAX; merge into AX.
    852  	SHLQ	$32, DX
    853  	ADDQ	DX, AX
    854  	MOVQ	AX, ret+0(FP)
    855  	RET
   856  
   857  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   858  // redirects to memhash(p, h, size) using the size
   859  // stored in the closure.
    860  TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
    861  	GO_ARGS
    862  	NO_LOCAL_POINTERS
    863  	MOVQ	p+0(FP), AX
    864  	MOVQ	h+8(FP), BX
         	// 8(DX): the size, read from the closure (DX = closure context
         	// register per the Go internal ABI — presumably set by the caller).
    865  	MOVQ	8(DX), CX
    866  	MOVQ	AX, 0(SP)
    867  	MOVQ	BX, 8(SP)
    868  	MOVQ	CX, 16(SP)
    869  	CALL	runtime·memhash(SB)
    870  	MOVQ	24(SP), AX
    871  	MOVQ	AX, ret+16(FP)
    872  	RET
   873  
   874  // hash function using AES hardware instructions
    875  TEXT runtime·aeshash(SB),NOSPLIT,$0-32
         	// Load aeshashbody's register contract: AX=data, CX=len, DX=&ret.
    876  	MOVQ	p+0(FP), AX	// ptr to data
    877  	MOVQ	s+16(FP), CX	// size
    878  	LEAQ	ret+24(FP), DX
    879  	JMP	runtime·aeshashbody(SB)
   880  
    881  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
         	// Unpack the string header (data ptr, len) before tail-jumping
         	// into the shared aeshashbody with AX=data, CX=len, DX=&ret.
    882  	MOVQ	p+0(FP), AX	// ptr to string struct
    883  	MOVQ	8(AX), CX	// length of string
    884  	MOVQ	(AX), AX	// string data
    885  	LEAQ	ret+16(FP), DX
    886  	JMP	runtime·aeshashbody(SB)
   887  
   888  // AX: data
   889  // CX: length
   890  // DX: address to put return value
   891  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   892  	// Fill an SSE register with our seeds.
   893  	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
   894  	PINSRW	$4, CX, X0			// 16 bits of length
   895  	PSHUFHW $0, X0, X0			// repeat length 4 times total
   896  	MOVO	X0, X1				// save unscrambled seed
   897  	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
   898  	AESENC	X0, X0				// scramble seed
   899  
   900  	CMPQ	CX, $16
   901  	JB	aes0to15
   902  	JE	aes16
   903  	CMPQ	CX, $32
   904  	JBE	aes17to32
   905  	CMPQ	CX, $64
   906  	JBE	aes33to64
   907  	CMPQ	CX, $128
   908  	JBE	aes65to128
   909  	JMP	aes129plus
   910  
   911  aes0to15:
   912  	TESTQ	CX, CX
   913  	JE	aes0
   914  
   915  	ADDQ	$16, AX
   916  	TESTW	$0xff0, AX
   917  	JE	endofpage
   918  
   919  	// 16 bytes loaded at this address won't cross
   920  	// a page boundary, so we can load it directly.
   921  	MOVOU	-16(AX), X1
   922  	ADDQ	CX, CX
   923  	MOVQ	$masks<>(SB), AX
   924  	PAND	(AX)(CX*8), X1
   925  final1:
   926  	PXOR	X0, X1	// xor data with seed
   927  	AESENC	X1, X1	// scramble combo 3 times
   928  	AESENC	X1, X1
   929  	AESENC	X1, X1
   930  	MOVQ	X1, (DX)
   931  	RET
   932  
   933  endofpage:
   934  	// address ends in 1111xxxx. Might be up against
   935  	// a page boundary, so load ending at last byte.
   936  	// Then shift bytes down using pshufb.
   937  	MOVOU	-32(AX)(CX*1), X1
   938  	ADDQ	CX, CX
   939  	MOVQ	$shifts<>(SB), AX
   940  	PSHUFB	(AX)(CX*8), X1
   941  	JMP	final1
   942  
   943  aes0:
   944  	// Return scrambled input seed
   945  	AESENC	X0, X0
   946  	MOVQ	X0, (DX)
   947  	RET
   948  
   949  aes16:
   950  	MOVOU	(AX), X1
   951  	JMP	final1
   952  
   953  aes17to32:
   954  	// make second starting seed
   955  	PXOR	runtime·aeskeysched+16(SB), X1
   956  	AESENC	X1, X1
   957  	
   958  	// load data to be hashed
   959  	MOVOU	(AX), X2
   960  	MOVOU	-16(AX)(CX*1), X3
   961  
   962  	// xor with seed
   963  	PXOR	X0, X2
   964  	PXOR	X1, X3
   965  
   966  	// scramble 3 times
   967  	AESENC	X2, X2
   968  	AESENC	X3, X3
   969  	AESENC	X2, X2
   970  	AESENC	X3, X3
   971  	AESENC	X2, X2
   972  	AESENC	X3, X3
   973  
   974  	// combine results
   975  	PXOR	X3, X2
   976  	MOVQ	X2, (DX)
   977  	RET
   978  
   979  aes33to64:
   980  	// make 3 more starting seeds
   981  	MOVO	X1, X2
   982  	MOVO	X1, X3
   983  	PXOR	runtime·aeskeysched+16(SB), X1
   984  	PXOR	runtime·aeskeysched+32(SB), X2
   985  	PXOR	runtime·aeskeysched+48(SB), X3
   986  	AESENC	X1, X1
   987  	AESENC	X2, X2
   988  	AESENC	X3, X3
   989  	
   990  	MOVOU	(AX), X4
   991  	MOVOU	16(AX), X5
   992  	MOVOU	-32(AX)(CX*1), X6
   993  	MOVOU	-16(AX)(CX*1), X7
   994  
   995  	PXOR	X0, X4
   996  	PXOR	X1, X5
   997  	PXOR	X2, X6
   998  	PXOR	X3, X7
   999  	
  1000  	AESENC	X4, X4
  1001  	AESENC	X5, X5
  1002  	AESENC	X6, X6
  1003  	AESENC	X7, X7
  1004  	
  1005  	AESENC	X4, X4
  1006  	AESENC	X5, X5
  1007  	AESENC	X6, X6
  1008  	AESENC	X7, X7
  1009  	
  1010  	AESENC	X4, X4
  1011  	AESENC	X5, X5
  1012  	AESENC	X6, X6
  1013  	AESENC	X7, X7
  1014  
  1015  	PXOR	X6, X4
  1016  	PXOR	X7, X5
  1017  	PXOR	X5, X4
  1018  	MOVQ	X4, (DX)
  1019  	RET
  1020  
  1021  aes65to128:
  1022  	// make 7 more starting seeds
  1023  	MOVO	X1, X2
  1024  	MOVO	X1, X3
  1025  	MOVO	X1, X4
  1026  	MOVO	X1, X5
  1027  	MOVO	X1, X6
  1028  	MOVO	X1, X7
  1029  	PXOR	runtime·aeskeysched+16(SB), X1
  1030  	PXOR	runtime·aeskeysched+32(SB), X2
  1031  	PXOR	runtime·aeskeysched+48(SB), X3
  1032  	PXOR	runtime·aeskeysched+64(SB), X4
  1033  	PXOR	runtime·aeskeysched+80(SB), X5
  1034  	PXOR	runtime·aeskeysched+96(SB), X6
  1035  	PXOR	runtime·aeskeysched+112(SB), X7
  1036  	AESENC	X1, X1
  1037  	AESENC	X2, X2
  1038  	AESENC	X3, X3
  1039  	AESENC	X4, X4
  1040  	AESENC	X5, X5
  1041  	AESENC	X6, X6
  1042  	AESENC	X7, X7
  1043  
  1044  	// load data
  1045  	MOVOU	(AX), X8
  1046  	MOVOU	16(AX), X9
  1047  	MOVOU	32(AX), X10
  1048  	MOVOU	48(AX), X11
  1049  	MOVOU	-64(AX)(CX*1), X12
  1050  	MOVOU	-48(AX)(CX*1), X13
  1051  	MOVOU	-32(AX)(CX*1), X14
  1052  	MOVOU	-16(AX)(CX*1), X15
  1053  
  1054  	// xor with seed
  1055  	PXOR	X0, X8
  1056  	PXOR	X1, X9
  1057  	PXOR	X2, X10
  1058  	PXOR	X3, X11
  1059  	PXOR	X4, X12
  1060  	PXOR	X5, X13
  1061  	PXOR	X6, X14
  1062  	PXOR	X7, X15
  1063  
  1064  	// scramble 3 times
  1065  	AESENC	X8, X8
  1066  	AESENC	X9, X9
  1067  	AESENC	X10, X10
  1068  	AESENC	X11, X11
  1069  	AESENC	X12, X12
  1070  	AESENC	X13, X13
  1071  	AESENC	X14, X14
  1072  	AESENC	X15, X15
  1073  
  1074  	AESENC	X8, X8
  1075  	AESENC	X9, X9
  1076  	AESENC	X10, X10
  1077  	AESENC	X11, X11
  1078  	AESENC	X12, X12
  1079  	AESENC	X13, X13
  1080  	AESENC	X14, X14
  1081  	AESENC	X15, X15
  1082  
  1083  	AESENC	X8, X8
  1084  	AESENC	X9, X9
  1085  	AESENC	X10, X10
  1086  	AESENC	X11, X11
  1087  	AESENC	X12, X12
  1088  	AESENC	X13, X13
  1089  	AESENC	X14, X14
  1090  	AESENC	X15, X15
  1091  
  1092  	// combine results
  1093  	PXOR	X12, X8
  1094  	PXOR	X13, X9
  1095  	PXOR	X14, X10
  1096  	PXOR	X15, X11
  1097  	PXOR	X10, X8
  1098  	PXOR	X11, X9
  1099  	PXOR	X9, X8
  1100  	MOVQ	X8, (DX)
  1101  	RET
  1102  
  1103  aes129plus:
  1104  	// make 7 more starting seeds
  1105  	MOVO	X1, X2
  1106  	MOVO	X1, X3
  1107  	MOVO	X1, X4
  1108  	MOVO	X1, X5
  1109  	MOVO	X1, X6
  1110  	MOVO	X1, X7
  1111  	PXOR	runtime·aeskeysched+16(SB), X1
  1112  	PXOR	runtime·aeskeysched+32(SB), X2
  1113  	PXOR	runtime·aeskeysched+48(SB), X3
  1114  	PXOR	runtime·aeskeysched+64(SB), X4
  1115  	PXOR	runtime·aeskeysched+80(SB), X5
  1116  	PXOR	runtime·aeskeysched+96(SB), X6
  1117  	PXOR	runtime·aeskeysched+112(SB), X7
  1118  	AESENC	X1, X1
  1119  	AESENC	X2, X2
  1120  	AESENC	X3, X3
  1121  	AESENC	X4, X4
  1122  	AESENC	X5, X5
  1123  	AESENC	X6, X6
  1124  	AESENC	X7, X7
  1125  	
  1126  	// start with last (possibly overlapping) block
  1127  	MOVOU	-128(AX)(CX*1), X8
  1128  	MOVOU	-112(AX)(CX*1), X9
  1129  	MOVOU	-96(AX)(CX*1), X10
  1130  	MOVOU	-80(AX)(CX*1), X11
  1131  	MOVOU	-64(AX)(CX*1), X12
  1132  	MOVOU	-48(AX)(CX*1), X13
  1133  	MOVOU	-32(AX)(CX*1), X14
  1134  	MOVOU	-16(AX)(CX*1), X15
  1135  
  1136  	// xor in seed
  1137  	PXOR	X0, X8
  1138  	PXOR	X1, X9
  1139  	PXOR	X2, X10
  1140  	PXOR	X3, X11
  1141  	PXOR	X4, X12
  1142  	PXOR	X5, X13
  1143  	PXOR	X6, X14
  1144  	PXOR	X7, X15
  1145  	
  1146  	// compute number of remaining 128-byte blocks
  1147  	DECQ	CX
  1148  	SHRQ	$7, CX
  1149  	
  1150  aesloop:
  1151  	// scramble state
  1152  	AESENC	X8, X8
  1153  	AESENC	X9, X9
  1154  	AESENC	X10, X10
  1155  	AESENC	X11, X11
  1156  	AESENC	X12, X12
  1157  	AESENC	X13, X13
  1158  	AESENC	X14, X14
  1159  	AESENC	X15, X15
  1160  
  1161  	// scramble state, xor in a block
  1162  	MOVOU	(AX), X0
  1163  	MOVOU	16(AX), X1
  1164  	MOVOU	32(AX), X2
  1165  	MOVOU	48(AX), X3
  1166  	AESENC	X0, X8
  1167  	AESENC	X1, X9
  1168  	AESENC	X2, X10
  1169  	AESENC	X3, X11
  1170  	MOVOU	64(AX), X4
  1171  	MOVOU	80(AX), X5
  1172  	MOVOU	96(AX), X6
  1173  	MOVOU	112(AX), X7
  1174  	AESENC	X4, X12
  1175  	AESENC	X5, X13
  1176  	AESENC	X6, X14
  1177  	AESENC	X7, X15
  1178  
  1179  	ADDQ	$128, AX
  1180  	DECQ	CX
  1181  	JNE	aesloop
  1182  
  1183  	// 3 more scrambles to finish
  1184  	AESENC	X8, X8
  1185  	AESENC	X9, X9
  1186  	AESENC	X10, X10
  1187  	AESENC	X11, X11
  1188  	AESENC	X12, X12
  1189  	AESENC	X13, X13
  1190  	AESENC	X14, X14
  1191  	AESENC	X15, X15
  1192  	AESENC	X8, X8
  1193  	AESENC	X9, X9
  1194  	AESENC	X10, X10
  1195  	AESENC	X11, X11
  1196  	AESENC	X12, X12
  1197  	AESENC	X13, X13
  1198  	AESENC	X14, X14
  1199  	AESENC	X15, X15
  1200  	AESENC	X8, X8
  1201  	AESENC	X9, X9
  1202  	AESENC	X10, X10
  1203  	AESENC	X11, X11
  1204  	AESENC	X12, X12
  1205  	AESENC	X13, X13
  1206  	AESENC	X14, X14
  1207  	AESENC	X15, X15
  1208  
  1209  	PXOR	X12, X8
  1210  	PXOR	X13, X9
  1211  	PXOR	X14, X10
  1212  	PXOR	X15, X11
  1213  	PXOR	X10, X8
  1214  	PXOR	X11, X9
  1215  	PXOR	X9, X8
  1216  	MOVQ	X8, (DX)
  1217  	RET
  1218  	
// aeshash32(p unsafe.Pointer, h uintptr) uintptr
// AES-based hash of a 4-byte value: the 32-bit datum is inserted into the
// seed register, then scrambled with three AES rounds keyed by the
// per-process key schedule. Requires AES-NI (caller checks useAeshash).
TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// insert the 32-bit datum into lane 2 of the seed
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)	// low 64 bits of X0 are the hash
	RET
  1228  
// aeshash64(p unsafe.Pointer, h uintptr) uintptr
// AES-based hash of an 8-byte value; same scheme as aeshash32 but the
// datum fills the upper 64-bit lane of the seed register.
TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// insert the 64-bit datum into the high lane
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)	// low 64 bits of X0 are the hash
	RET
  1238  
// simple mask to get rid of data in the high part of the register.
// masks<> is a table of sixteen 16-byte entries; entry i (at offset i*16,
// 0 <= i <= 15) has its low i bytes set to 0xff and the rest zero.
// PANDing a 16-byte load with entry i therefore keeps only the first
// i bytes (used by the 0-15 byte tail of the AES hash).
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256
  1273  
// checkASM() bool
// Sanity check run at startup: reports whether the masks<> and shifts<>
// tables are 16-byte aligned (they are loaded with aligned-assuming code).
TEXT ·checkASM(SB),NOSPLIT,$0-1
	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
	MOVQ	$masks<>(SB), AX
	MOVQ	$shifts<>(SB), BX
	ORQ	BX, AX		// any misaligned bit in either address survives the OR
	TESTQ	$15, AX		// ZF set iff both addresses have low 4 bits clear
	SETEQ	ret+0(FP)
	RET
  1282  
// these are arguments to pshufb. They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
// Entry i (at offset i*16) shifts the top i bytes of the register down to
// the bottom; control bytes with the high bit set (0xff here) zero the
// corresponding destination byte, so the rest of the register is cleared.
DATA shifts<>+0x00(SB)/8, $0x0000000000000000
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256
  1319  
// memequal(a, b unsafe.Pointer, size uintptr) bool
// Reports whether the size bytes at a and b are identical.
// Fast path: identical pointers are trivially equal; otherwise the
// comparison is delegated to memeqbody (args passed in registers).
TEXT runtime·memequal(SB),NOSPLIT,$0-25
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq		// same pointer => equal without reading memory
	MOVQ	size+16(FP), BX
	LEAQ	ret+24(FP), AX	// memeqbody stores the bool result through AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+24(FP)
	RET
  1332  
// memequal_varlen(a, b unsafe.Pointer) bool
// Like memequal, but the size is not an argument: this function is only
// called through a closure, and the compiler stores the size in the
// closure object (reachable via DX, the closure context register).
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq		// same pointer => equal without reading memory
	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
	LEAQ	ret+16(FP), AX	// memeqbody stores the bool result through AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+16(FP)
	RET
  1345  
// eqstring tests whether two strings are equal.
// The compiler guarantees that strings passed
// to eqstring have equal length, so only the bytes
// (s1_len of them) need comparing.
// See runtime_test.go:eqstring_generic for
// equivalent Go code.
TEXT runtime·eqstring(SB),NOSPLIT,$0-33
	MOVQ	s1_base+0(FP), SI
	MOVQ	s2_base+16(FP), DI
	CMPQ	SI, DI
	JEQ	eq		// same backing pointer => equal (lengths already match)
	MOVQ	s1_len+8(FP), BX
	LEAQ	ret+32(FP), AX	// memeqbody stores the bool result through AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+32(FP)
	RET
  1362  
// memeqbody compares BX bytes at SI and DI and stores the boolean
// result (1 = equal) through the pointer in AX.
// Strategy: 64 bytes/iter with AVX2 or SSE when available and enough
// data remains, then 8 bytes/iter, then an overlapping 8-byte tail,
// with a branch-free path for counts below 8.
// a in SI
// b in DI
// count in BX
// address of result byte in AX
TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small
	CMPQ	BX, $64
	JB	bigloop
	CMPB    runtime·support_avx2(SB), $1
	JE	hugeloop_avx2

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0		// per-byte equality masks...
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0		// ...ANDed together: a byte is 0xff only if all 4 pairs matched
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff	// all 16 mask bits set => the 64 bytes were equal
	JEQ	hugeloop
	MOVB	$0, (AX)	// mismatch found
	RET

	// 64 bytes at a time using ymm registers
hugeloop_avx2:
	CMPQ	BX, $64
	JB	bigloop_avx2
	VMOVDQU	(SI), Y0
	VMOVDQU	(DI), Y1
	VMOVDQU	32(SI), Y2
	VMOVDQU	32(DI), Y3
	VPCMPEQB	Y1, Y0, Y4
	VPCMPEQB	Y2, Y3, Y5
	VPAND	Y4, Y5, Y6
	VPMOVMSKB Y6, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffffffff	// all 32 mask bits set => the 64 bytes were equal
	JEQ	hugeloop_avx2
	VZEROUPPER		// leave AVX state before returning to SSE-era code
	MOVB	$0, (AX)
	RET

bigloop_avx2:
	VZEROUPPER		// clear upper ymm halves before falling into SSE/GPR code

	// 8 bytes at a time using 64-bit register
bigloop:
	CMPQ	BX, $8
	JBE	leftover
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	bigloop
	MOVB	$0, (AX)
	RET

	// remaining 0-8 bytes
leftover:
	// Load the LAST 8 bytes of each buffer; this may re-compare bytes
	// already checked, which is harmless.
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	(AX)
	RET

small:
	CMPQ	BX, $0
	JEQ	equal		// zero bytes: trivially equal (flags from CMPQ give ZF=1)

	LEAQ	0(BX*8), CX	// CX = bits to keep
	NEGQ	CX		// CX = 64 - bits to keep (mod 64), shift count below

	CMPB	SI, $0xf8
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
di_finish:

	SUBQ	SI, DI		// zero iff the loaded words match
	SHLQ	CX, DI		// discard the high garbage bytes; sets ZF for SETEQ
equal:
	SETEQ	(AX)
	RET
  1483  
// cmpstring(s1, s2 string) int
// Three-way string comparison (-1/0/+1); loads args into the register
// convention expected by cmpbody and tail-jumps to it.
TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
	MOVQ	s1_base+0(FP), SI
	MOVQ	s1_len+8(FP), BX
	MOVQ	s2_base+16(FP), DI
	MOVQ	s2_len+24(FP), DX
	LEAQ	ret+32(FP), R9	// cmpbody stores the int result through R9
	JMP	runtime·cmpbody(SB)
  1491  
// bytes.Compare(s1, s2 []byte) int
// Slice version of cmpstring; slice caps (at +16 and +40) are ignored.
TEXT bytes·Compare(SB),NOSPLIT,$0-56
	MOVQ	s1+0(FP), SI	// s1 base
	MOVQ	s1+8(FP), BX	// s1 len
	MOVQ	s2+24(FP), DI	// s2 base
	MOVQ	s2+32(FP), DX	// s2 len
	LEAQ	res+48(FP), R9	// cmpbody stores the int result through R9
	JMP	runtime·cmpbody(SB)
  1499  
// cmpbody performs a three-way (-1/0/+1) lexicographic comparison of the
// byte ranges [SI, SI+BX) and [DI, DI+DX). It compares min(BX, DX) bytes;
// if those are all equal the shorter operand sorts first.
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame		// same pointer: result depends only on lengths
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop		// medium sizes: 16 bytes at a time
	CMPB    runtime·support_avx2(SB), $1
	JEQ     big_loop_avx2
	JMP	big_loop
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// The diffNN labels adjust SI/DI to the 16-byte chunk (found by
	// big_loop) that contains the first difference, then fall into diff16.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX		// 1 if a's byte > b's byte (unsigned)
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// Compare the LAST 8 bytes; may overlap bytes already compared equal.
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
	JEQ	allsame

	// load bytes of a into high bytes of AX
	CMPB	SI, $0xf8
	JA	si_high		// near a page end: can't do a straight 8-byte load
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI		// keep only the R8 valid bytes, in the high end

	// load bytes of b into high bytes of DI (same page-boundary dance)
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

allsame:
	// Common prefix is identical; order by length.
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET

	// this works for >= 64 bytes of data.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop		// <= 64 bytes left: finish with the 16-byte loop
	JMP	big_loop

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP diff16

	// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP loop
  1705  
// strings.indexShortStr(s, c string) int
// Thin adapter: marshals string args into the register convention of
// runtime·indexShortStr and tail-jumps to it.
TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), DI
	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
	MOVQ s_len+8(FP), DX
	MOVQ c+16(FP), BP
	MOVQ c_len+24(FP), AX
	MOVQ DI, R10		// remember start of s to compute the index later
	LEAQ ret+32(FP), R11
	JMP  runtime·indexShortStr(SB)
  1715  
// bytes.indexShortStr(s, c []byte) int
// Slice version of strings·indexShortStr; slice caps are ignored.
TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
	MOVQ s+0(FP), DI
	MOVQ s_len+8(FP), DX
	MOVQ c+24(FP), BP
	MOVQ c_len+32(FP), AX
	MOVQ DI, R10		// remember start of s to compute the index later
	LEAQ ret+48(FP), R11
	JMP  runtime·indexShortStr(SB)
  1724  
// indexShortStr returns the byte index of the first occurrence of the
// needle in the haystack, or -1 if absent, storing the result via R11.
// It dispatches on needle length: widths up to 8 bytes use scalar loads,
// 9-31 use SSE, 32-63 use AVX2 (reached only when the caller's cutoff
// guarantees AVX2 is available), and short needles in haystacks >= 16
// bytes may use SSE4.2 PCMPESTRI. Needles longer than one register width
// are matched by comparing a head word and a tail word, which overlap
// for lengths that are not a power of two.
// AX: length of string, that we are searching for
// DX: length of string, in which we are searching
// DI: pointer to string, in which we are searching
// BP: pointer to string, that we are searching for
// R11: address, where to put return value
TEXT runtime·indexShortStr(SB),NOSPLIT,$0
	CMPQ AX, DX
	JA fail			// needle longer than haystack: cannot match
	CMPQ DX, $16
	JAE sse42
no_sse42:
	CMPQ AX, $2
	JA   _3_or_more
	// 1- or 2-byte needle: a single 2-byte compare per position.
	// (For a 1-byte needle the caller guarantees DX >= 2, so reading
	// 2 bytes is safe; the loop bound makes the extra byte a don't-care.)
	MOVW (BP), BP
	LEAQ -1(DI)(DX*1), DX	// DX = last valid search position + 1
loop2:
	MOVW (DI), SI
	CMPW SI,BP
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop2
	JMP fail
_3_or_more:
	CMPQ AX, $3
	JA   _4_or_more
	// 3-byte needle: compare first 2 bytes, then last 2 (overlapping).
	MOVW 1(BP), BX
	MOVW (BP), BP
	LEAQ -2(DI)(DX*1), DX
loop3:
	MOVW (DI), SI
	CMPW SI,BP
	JZ   partial_success3
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
partial_success3:
	MOVW 1(DI), SI
	CMPW SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
_4_or_more:
	CMPQ AX, $4
	JA   _5_or_more
	// 4-byte needle: one 32-bit compare per position.
	MOVL (BP), BP
	LEAQ -3(DI)(DX*1), DX
loop4:
	MOVL (DI), SI
	CMPL SI,BP
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop4
	JMP fail
_5_or_more:
	CMPQ AX, $7
	JA   _8_or_more
	// 5-7 byte needle: compare first 4 bytes, then last 4 (overlapping).
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX		// DX = last valid search position + 1
	MOVL -4(BP)(AX*1), BX	// last 4 bytes of needle
	MOVL (BP), BP		// first 4 bytes of needle
loop5to7:
	MOVL (DI), SI
	CMPL SI,BP
	JZ   partial_success5to7
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
partial_success5to7:
	MOVL -4(AX)(DI*1), SI	// 4 bytes ending at DI+AX (needle tail position)
	CMPL SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
_8_or_more:
	CMPQ AX, $8
	JA   _9_or_more
	// 8-byte needle: one 64-bit compare per position.
	MOVQ (BP), BP
	LEAQ -7(DI)(DX*1), DX
loop8:
	MOVQ (DI), SI
	CMPQ SI,BP
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop8
	JMP fail
_9_or_more:
	CMPQ AX, $15
	JA   _16_or_more
	// 9-15 byte needle: compare first 8 bytes, then last 8 (overlapping).
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVQ -8(BP)(AX*1), BX
	MOVQ (BP), BP
loop9to15:
	MOVQ (DI), SI
	CMPQ SI,BP
	JZ   partial_success9to15
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
partial_success9to15:
	MOVQ -8(AX)(DI*1), SI	// 8 bytes ending at DI+AX
	CMPQ SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
_16_or_more:
	CMPQ AX, $16
	JA   _17_or_more
	// 16-byte needle: one full XMM compare per position.
	MOVOU (BP), X1
	LEAQ -15(DI)(DX*1), DX
loop16:
	MOVOU (DI), X2
	PCMPEQB X1, X2
	PMOVMSKB X2, SI
	CMPQ  SI, $0xffff	// all 16 bytes equal?
	JE   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop16
	JMP fail
_17_or_more:
	CMPQ AX, $31
	JA   _32_or_more
	// 17-31 byte needle: XMM compare of first 16, then last 16 (overlapping).
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVOU -16(BP)(AX*1), X0	// last 16 bytes of needle
	MOVOU (BP), X1		// first 16 bytes of needle
loop17to31:
	MOVOU (DI), X2
	PCMPEQB X1,X2
	PMOVMSKB X2, SI
	CMPQ  SI, $0xffff
	JE   partial_success17to31
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
partial_success17to31:
	MOVOU -16(AX)(DI*1), X3	// 16 bytes ending at DI+AX
	PCMPEQB X0, X3
	PMOVMSKB X3, SI
	CMPQ  SI, $0xffff
	JE success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
// So no need to check cpuid
_32_or_more:
	CMPQ AX, $32
	JA   _33_to_63
	// 32-byte needle: one full YMM compare per position.
	VMOVDQU (BP), Y1
	LEAQ -31(DI)(DX*1), DX
loop32:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL  SI, $0xffffffff	// all 32 bytes equal?
	JE   success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop32
	JMP fail_avx2
_33_to_63:
	// 33-63 byte needle: YMM compare of first 32, then last 32 (overlapping).
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	VMOVDQU -32(BP)(AX*1), Y0
	VMOVDQU (BP), Y1
loop33to63:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL  SI, $0xffffffff
	JE   partial_success33to63
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop33to63
	JMP fail_avx2
partial_success33to63:
	VMOVDQU -32(AX)(DI*1), Y3	// 32 bytes ending at DI+AX
	VPCMPEQB Y0, Y3, Y4
	VPMOVMSKB Y4, SI
	CMPL  SI, $0xffffffff
	JE success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop33to63
fail_avx2:
	VZEROUPPER		// leave AVX state before returning
fail:
	MOVQ $-1, (R11)
	RET
success_avx2:
	VZEROUPPER
	JMP success
sse42:
	CMPB runtime·support_sse42(SB), $1
	JNE no_sse42
	CMPQ AX, $12
	// PCMPESTRI is slower than normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
	JAE _9_or_more
	LEAQ 16(BP), SI
	TESTW $0xff0, SI
	JEQ no_sse42		// needle crosses a page boundary; 16-byte load unsafe
	MOVOU (BP), X1
	LEAQ -15(DI)(DX*1), SI
	MOVQ $16, R9
	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
	// for equality (bits 2,3 are 11)
	// result is not masked or inverted (bits 4,5 are 00)
	// and corresponds to first matching byte (bit 6 is 0)
	PCMPESTRI $0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means partial match at the end of the string,
	// otherwise sep is at offset CX from X1 start
	CMPQ CX, R9
	JBE sse42_success
	ADDQ R9, DI
	CMPQ DI, SI
	JB loop_sse42
	// Final, possibly overlapping, chunk ending at the last 16 bytes.
	PCMPESTRI $0x0c, -1(SI), X1
	CMPQ CX, R9
	JA fail
	LEAQ -1(SI), DI
sse42_success:
	ADDQ CX, DI
success:
	SUBQ R10, DI		// index = match position - start of haystack
	MOVQ DI, (R11)
	RET
  1973  
  1974  
// bytes.IndexByte(s []byte, c byte) int
// Adapter: loads args into the register convention of indexbytebody.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+24(FP), AL
	LEAQ ret+32(FP), R8
	JMP  runtime·indexbytebody(SB)
  1981  
// strings.IndexByte(s string, c byte) int
// Adapter: loads args into the register convention of indexbytebody.
TEXT strings·IndexByte(SB),NOSPLIT,$0-32
	MOVQ s+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+16(FP), AL
	LEAQ ret+24(FP), R8
	JMP  runtime·indexbytebody(SB)
  1988  
// indexbytebody finds the first occurrence of the byte in AL within the
// BX bytes at SI, storing its index (or -1) through the pointer in R8.
// Uses 32-byte AVX2 chunks when available and BX > 32, otherwise 16-byte
// SSE chunks, with a page-boundary-safe path for lengths < 16.
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0

	CMPQ BX, $16
	JLT small

	MOVQ SI, DI		// DI = current search position

	CMPQ BX, $32
	JA avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Find first set bit, if any.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ $-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ SI, DI	// Compute offset of chunk within data.
	ADDQ DX, DI	// Add offset of byte within chunk.
	MOVQ DI, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure		// empty input: not found

	// Check if we'll load across a page boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1 // Load data
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	CMPL	DX, BX
	JAE	failure	// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX	// Shift desired bits down to bottom of register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	CMPB   runtime·support_avx2(SB), $1
	JNE sse			// no AVX2: fall back to 16-byte chunks
	MOVD AX, X0
	LEAQ -32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB  X0, Y1	// Y1 = target byte replicated 32 times
avx2_loop:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPTEST Y3, Y3		// ZF clear iff some byte matched
	JNZ avx2success
	ADDQ $32, DI
	CMPQ DI, R11
	JLT avx2_loop
	// Final, possibly overlapping, 32-byte chunk.
	MOVQ R11, DI
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPTEST Y3, Y3
	JNZ avx2success
	VZEROUPPER		// leave AVX state before returning
	MOVQ $-1, (R8)
	RET

avx2success:
	VPMOVMSKB Y3, DX
	BSFL DX, DX		// index of match within the chunk
	SUBQ SI, DI
	ADDQ DI, DX		// add chunk offset within data
	MOVQ DX, (R8)
	VZEROUPPER
	RET
  2115  
// bytes.Equal(a, b []byte) bool
// Unequal lengths are unequal without touching the data; otherwise the
// byte comparison is delegated to memeqbody.
TEXT bytes·Equal(SB),NOSPLIT,$0-49
	MOVQ	a_len+8(FP), BX
	MOVQ	b_len+32(FP), CX
	CMPQ	BX, CX
	JNE	eqret
	MOVQ	a+0(FP), SI
	MOVQ	b+24(FP), DI
	LEAQ	ret+48(FP), AX	// memeqbody stores the bool result through AX
	JMP	runtime·memeqbody(SB)
eqret:
	MOVB	$0, ret+48(FP)	// lengths differ => not equal
	RET
  2128  
  2129  
// int bytes.countByte(s []byte, c byte)
// Trampoline: marshals the Go arguments into the register contract of
// runtime·countByte (SI=data, BX=len, AL=byte sought, R8=&ret) and
// tail-calls it. Frame: s slice (0..23), c at 24, ret int at 32 => $0-40.
TEXT bytes·countByte(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), SI	// SI = &s[0]
	MOVQ s_len+8(FP), BX	// BX = len(s)
	MOVB c+24(FP), AL	// AL = byte to count
	LEAQ ret+32(FP), R8	// R8 = address of int result
	JMP  runtime·countByte(SB)
  2136  
// int strings.countByte(s string, c byte)
// Same trampoline as bytes·countByte, but for a string header:
// s occupies bytes 0..15, c is at 16, ret int at 24 => $0-32.
TEXT strings·countByte(SB),NOSPLIT,$0-32
	MOVQ s+0(FP), SI	// SI = string data pointer
	MOVQ s_len+8(FP), BX	// BX = len(s)
	MOVB c+16(FP), AL	// AL = byte to count
	LEAQ ret+24(FP), R8	// R8 = address of int result
	JMP  runtime·countByte(SB)
  2143  
// runtime·countByte counts occurrences of a byte in a memory region.
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result (int64 count)
// This requires the POPCNT instruction (caller guarantees support;
// the dispatch to this routine is gated elsewhere in the runtime).
//
// Strategy: broadcast the target byte across an XMM (or YMM) register,
// PCMPEQB against 16/32-byte chunks, extract the per-byte match bits
// with PMOVMSKB, and POPCNT them into an accumulator. Tails that
// overlap previously counted chunks are masked off so no byte is
// counted twice.
TEXT runtime·countByte(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD AX, X0
	PUNPCKLBW X0, X0	// byte -> doubled into word
	PUNPCKLBW X0, X0	// word -> doubled into dword
	PSHUFL $0, X0, X0	// dword -> splatted to all four lanes

	CMPQ BX, $16
	JLT small		// < 16 bytes: single masked load, no loop

	MOVQ $0, R12 // Accumulator

	MOVQ SI, DI		// DI = current read cursor

	CMPQ BX, $32
	JA avx2			// > 32 bytes: try the wider AVX2 path
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

	// Main 16-bytes-per-iteration counting loop.
sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Count number of matching bytes
	POPCNTL DX, DX
	// Accumulate into R12
	ADDQ DX, R12
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JBE	sseloop		// loop while a full fresh 16-byte chunk remains

	// Get the number of bytes to consider in the last 16 bytes
	ANDQ $15, BX		// BX = len mod 16 (bytes not yet counted)
	JZ end			// length was a multiple of 16: done

	// Create mask to ignore overlap between previous 16 byte block
	// and the next. The last chunk is loaded from AX (end-16), so its
	// low (16-BX) bytes were already counted; keep only the high BX bits.
	MOVQ $16,CX
	SUBQ BX, CX		// CX = 16 - BX = overlap size
	MOVQ $0xFFFF, R10
	SARQ CL, R10		// shift right...
	SALQ CL, R10		// ...and back left: clears the low CX bits

	// Process the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched so we need to mask part of it.
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ R10, DX
	POPCNTL DX, DX
	ADDQ DX, R12
end:
	MOVQ R12, (R8)		// store final count
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	endzero		// empty input: count is 0

	// Check if we'll load across a page boundary.
	// A full 16-byte load is only safe if it stays within the page;
	// (SI+16) having zero low 12 bits means the load would cross.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	// We must ignore high bytes as they aren't part of our slice.
	// Create mask: low BX bits set.
	MOVB BX, CX
	MOVQ $1, R10
	SALQ CL, R10
	SUBQ $1, R10		// R10 = (1<<BX) - 1

	// Load data
	MOVOU	(SI), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ R10, DX
	POPCNTL DX, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	MOVQ	DX, (R8)
	RET
endzero:
	MOVQ $0, (R8)		// len == 0: count is 0
	RET

endofpage:
	// A forward 16-byte load would cross a page boundary, so load the
	// 16 bytes ENDING at the slice end instead; the data lands in the
	// high end of X1.
	// We must ignore low bytes as they aren't part of our slice.
	MOVQ $16,CX
	SUBQ BX, CX		// CX = 16 - BX bytes of junk at the low end
	MOVQ $0xFFFF, R10
	SARQ CL, R10
	SALQ CL, R10		// mask keeps only the high BX bits

	// Load data into the high end of X1.
	MOVOU	-16(SI)(BX*1), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ R10, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	POPCNTL DX, DX
	MOVQ	DX, (R8)
	RET

// AVX2 path for lengths > 32. Falls back to the SSE loop when the CPU
// lacks AVX2 support.
avx2:
	CMPB   runtime·support_avx2(SB), $1
	JNE sse
	MOVD AX, X0
	LEAQ -32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB  X0, Y1	// splat target byte across all 32 lanes of Y1
avx2_loop:
	VMOVDQU (DI), Y2	// load 32 bytes
	VPCMPEQB Y1, Y2, Y3	// per-byte equality -> 0xFF/0x00 lanes
	VPMOVMSKB Y3, DX	// match bits -> DX
	POPCNTL DX, DX		// count matches in this chunk
	ADDQ DX, R12
	ADDQ $32, DI
	CMPQ DI, R11
	JLE avx2_loop		// loop while a fresh chunk starts at or before R11

	// If last block is already processed,
	// skip to the end. (DI == R11 means the final chunk was exactly
	// the one just counted, i.e. len is a multiple of 32.)
	CMPQ DI, R11
	JEQ endavx

	// Load address of the last 32 bytes.
	// There is an overlap with the previous block.
	MOVQ R11, DI
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, DX
	// Exit AVX mode. (Safe here: match bits are already in DX and the
	// remaining work is scalar.)
	VZEROUPPER

	// Create mask to ignore overlap between previous 32 byte block
	// and the next. Keep only the high (len mod 32) bits.
	ANDQ $31, BX		// BX = len mod 32
	MOVQ $32,CX
	SUBQ BX, CX		// CX = 32 - BX = overlap size
	MOVQ $0xFFFFFFFF, R10
	SARQ CL, R10
	SALQ CL, R10		// clears the low CX bits of the 32-bit mask
	// Apply mask
	ANDQ R10, DX
	POPCNTL DX, DX
	ADDQ DX, R12
	MOVQ R12, (R8)		// store final count
	RET
endavx:
	// Exit AVX mode.
	VZEROUPPER
	MOVQ R12, (R8)		// store final count
	RET
  2318  
// runtime·return0 sets the machine return-value register to 0.
// Used by the runtime to fake a zero return from functions that were
// preempted or restarted (the caller resumes as if 0 was returned).
TEXT runtime·return0(SB), NOSPLIT, $0
	MOVL	$0, AX	// AX is the integer return register
	RET
  2322  
  2323  
// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
// Must obey the gcc calling convention: result in AX; only scratch
// registers (AX, CX) are touched, so no callee-saved spills are needed.
TEXT _cgo_topofstack(SB),NOSPLIT,$0
	get_tls(CX)			// CX = TLS base
	MOVQ	g(CX), AX		// AX = current g
	MOVQ	g_m(AX), AX		// AX = g->m
	MOVQ	m_curg(AX), AX		// AX = m->curg
	MOVQ	(g_stack+stack_hi)(AX), AX	// AX = curg->stack.hi
	RET
  2333  
// The top-most function running on a goroutine
// returns to goexit+PCQuantum.
// The leading NOP exists so that goexit+PCQuantum is still inside this
// function; the trailing NOP keeps the CALL's return address inside
// goexit's code range for traceback. Do not remove either byte.
TEXT runtime·goexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP
	CALL	runtime·goexit1(SB)	// does not return
	// traceback from goexit1 must hit code range of goexit
	BYTE	$0x90	// NOP
  2341  
// runtime·prefetcht0(addr uintptr) — hint the CPU to prefetch the cache
// line at addr into all cache levels (PREFETCHT0). Purely advisory.
TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT0	(AX)
	RET
  2346  
// runtime·prefetcht1(addr uintptr) — prefetch hint targeting L2 and
// higher cache levels (PREFETCHT1). Purely advisory.
TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT1	(AX)
	RET
  2351  
// runtime·prefetcht2(addr uintptr) — prefetch hint targeting L3 and
// higher cache levels (PREFETCHT2). Purely advisory.
TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT2	(AX)
	RET
  2356  
// runtime·prefetchnta(addr uintptr) — non-temporal prefetch hint
// (PREFETCHNTA): fetch minimizing cache pollution. Purely advisory.
TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHNTA	(AX)
	RET
  2361  
// This is called from .init_array and follows the platform, not Go, ABI.
// DI holds the first C argument: a pointer to the new moduledata.
// Appends it to the runtime's linked list:
//   lastmoduledatap->next = DI; lastmoduledatap = DI.
TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
	MOVQ	runtime·lastmoduledatap(SB), AX	// AX = current list tail
	MOVQ	DI, moduledata_next(AX)		// tail->next = new module
	MOVQ	DI, runtime·lastmoduledatap(SB)	// new module is the tail
	POPQ	R15
	RET