github.com/s1s1ty/go@v0.0.0-20180207192209-104445e3140f/src/runtime/asm_amd64.s

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  // _rt0_amd64 is common startup code for most amd64 systems when using
    11  // internal linking. This is the entry point for the program from the
    12  // kernel for an ordinary -buildmode=exe program. The stack holds the
    13  // number of arguments and the C-style argv.
    14  TEXT _rt0_amd64(SB),NOSPLIT,$-8
    15  	MOVQ	0(SP), DI	// argc
    16  	LEAQ	8(SP), SI	// argv
    17  	JMP	runtime·rt0_go(SB)
    18  
    19  // main is common startup code for most amd64 systems when using
    20  // external linking. The C startup code will call the symbol "main"
    21  // passing argc and argv in the usual C ABI registers DI and SI.
    22  TEXT main(SB),NOSPLIT,$-8
    23  	JMP	runtime·rt0_go(SB)
    24  
    25  // _rt0_amd64_lib is common startup code for most amd64 systems when
    26  // using -buildmode=c-archive or -buildmode=c-shared. The linker will
    27  // arrange to invoke this function as a global constructor (for
    28  // c-archive) or when the shared library is loaded (for c-shared).
    29  // We expect argc and argv to be passed in the usual C ABI registers
    30  // DI and SI.
    31  TEXT _rt0_amd64_lib(SB),NOSPLIT,$0x50
    32  	// Align stack per ELF ABI requirements.
    33  	MOVQ	SP, AX
    34  	ANDQ	$~15, SP
    35  	// Save C ABI callee-saved registers, as caller may need them.
    36  	MOVQ	BX, 0x10(SP)
    37  	MOVQ	BP, 0x18(SP)
    38  	MOVQ	R12, 0x20(SP)
    39  	MOVQ	R13, 0x28(SP)
    40  	MOVQ	R14, 0x30(SP)
    41  	MOVQ	R15, 0x38(SP)
    42  	MOVQ	AX, 0x40(SP)
    43  
    44  	MOVQ	DI, _rt0_amd64_lib_argc<>(SB)
    45  	MOVQ	SI, _rt0_amd64_lib_argv<>(SB)
    46  
    47  	// Synchronous initialization.
    48  	CALL	runtime·libpreinit(SB)
    49  
    50  	// Create a new thread to finish Go runtime initialization.
    51  	MOVQ	_cgo_sys_thread_create(SB), AX
    52  	TESTQ	AX, AX
    53  	JZ	nocgo
    54  	MOVQ	$_rt0_amd64_lib_go(SB), DI
    55  	MOVQ	$0, SI
    56  	CALL	AX
    57  	JMP	restore
    58  
    59  nocgo:
    60  	MOVQ	$0x800000, 0(SP)		// stacksize
    61  	MOVQ	$_rt0_amd64_lib_go(SB), AX
    62  	MOVQ	AX, 8(SP)			// fn
    63  	CALL	runtime·newosproc0(SB)
    64  
    65  restore:
    66  	MOVQ	0x10(SP), BX
    67  	MOVQ	0x18(SP), BP
    68  	MOVQ	0x20(SP), R12
    69  	MOVQ	0x28(SP), R13
    70  	MOVQ	0x30(SP), R14
    71  	MOVQ	0x38(SP), R15
    72  	MOVQ	0x40(SP), SP
    73  	RET
    74  
    75  // _rt0_amd64_lib_go initializes the Go runtime.
    76  // This is started in a separate thread by _rt0_amd64_lib.
    77  TEXT _rt0_amd64_lib_go(SB),NOSPLIT,$0
    78  	MOVQ	_rt0_amd64_lib_argc<>(SB), DI
    79  	MOVQ	_rt0_amd64_lib_argv<>(SB), SI
    80  	JMP	runtime·rt0_go(SB)
    81  
    82  DATA _rt0_amd64_lib_argc<>(SB)/8, $0
    83  GLOBL _rt0_amd64_lib_argc<>(SB),NOPTR, $8
    84  DATA _rt0_amd64_lib_argv<>(SB)/8, $0
    85  GLOBL _rt0_amd64_lib_argv<>(SB),NOPTR, $8
    86  
    87  TEXT runtime·rt0_go(SB),NOSPLIT,$0
    88  	// copy arguments forward on an even stack
    89  	MOVQ	DI, AX		// argc
    90  	MOVQ	SI, BX		// argv
    91  	SUBQ	$(4*8+7), SP		// 2args 2auto
    92  	ANDQ	$~15, SP
    93  	MOVQ	AX, 16(SP)
    94  	MOVQ	BX, 24(SP)
    95  	
    96  	// create istack out of the given (operating system) stack.
    97  	// _cgo_init may update stackguard.
    98  	MOVQ	$runtime·g0(SB), DI
    99  	LEAQ	(-64*1024+104)(SP), BX
   100  	MOVQ	BX, g_stackguard0(DI)
   101  	MOVQ	BX, g_stackguard1(DI)
   102  	MOVQ	BX, (g_stack+stack_lo)(DI)
   103  	MOVQ	SP, (g_stack+stack_hi)(DI)
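	// Note (added): the bootstrap g0 borrows about 64 KB of the OS-provided
	// stack. stack.hi is the current SP, and stack.lo (and, for now, both
	// stack guards) is set 64*1024-104 bytes below it. If _cgo_init is
	// present, the guards are recomputed below using _StackGuard.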
   104  
   105  	// find out information about the processor we're on
   106  	MOVL	$0, AX
   107  	CPUID
   108  	MOVL	AX, SI
   109  	CMPL	AX, $0
   110  	JE	nocpuinfo
   111  
   112  	// Figure out how to serialize RDTSC.
   113  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
   114  	// Don't know about the rest, so let's do MFENCE.
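	// CPUID leaf 0 leaves the vendor string "GenuineIntel" split across
	// BX, DX and CX; each CMPL below checks one third against its
	// little-endian byte encoding ("Genu", "ineI", "ntel").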
   115  	CMPL	BX, $0x756E6547  // "Genu"
   116  	JNE	notintel
   117  	CMPL	DX, $0x49656E69  // "ineI"
   118  	JNE	notintel
   119  	CMPL	CX, $0x6C65746E  // "ntel"
   120  	JNE	notintel
   121  	MOVB	$1, runtime·isIntel(SB)
   122  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
   123  notintel:
   124  
   125  	// Load EAX=1 cpuid flags
   126  	MOVL	$1, AX
   127  	CPUID
   128  	MOVL	AX, runtime·processorVersionInfo(SB)
   129  
   130  	TESTL	$(1<<26), DX // SSE2
   131  	SETNE	runtime·support_sse2(SB)
   132  
   133  	TESTL	$(1<<9), CX // SSSE3
   134  	SETNE	runtime·support_ssse3(SB)
   135  
   136  	TESTL	$(1<<19), CX // SSE4.1
   137  	SETNE	runtime·support_sse41(SB)
   138  
   139  	TESTL	$(1<<20), CX // SSE4.2
   140  	SETNE	runtime·support_sse42(SB)
   141  
   142  	TESTL	$(1<<23), CX // POPCNT
   143  	SETNE	runtime·support_popcnt(SB)
   144  
   145  	TESTL	$(1<<25), CX // AES
   146  	SETNE	runtime·support_aes(SB)
   147  
   148  	TESTL	$(1<<27), CX // OSXSAVE
   149  	SETNE	runtime·support_osxsave(SB)
   150  
   151  	// If OS support for XMM and YMM is not present
   152  	// support_avx will be set back to false later.
   153  	TESTL	$(1<<28), CX // AVX
   154  	SETNE	runtime·support_avx(SB)
   155  
   156  eax7:
   157  	// Load EAX=7/ECX=0 cpuid flags
   158  	CMPL	SI, $7
   159  	JLT	osavx
   160  	MOVL	$7, AX
   161  	MOVL	$0, CX
   162  	CPUID
   163  
   164  	TESTL	$(1<<3), BX // BMI1
   165  	SETNE	runtime·support_bmi1(SB)
   166  
   167  	// If OS support for XMM and YMM is not present
   168  	// support_avx2 will be set back to false later.
   169  	TESTL	$(1<<5), BX
   170  	SETNE	runtime·support_avx2(SB)
   171  
   172  	TESTL	$(1<<8), BX // BMI2
   173  	SETNE	runtime·support_bmi2(SB)
   174  
   175  	TESTL	$(1<<9), BX // ERMS
   176  	SETNE	runtime·support_erms(SB)
   177  
   178  osavx:
   179  	CMPB	runtime·support_osxsave(SB), $1
   180  	JNE	noavx
   181  	MOVL	$0, CX
   182  	// For XGETBV, OSXSAVE bit is required and sufficient
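	// XGETBV with CX=0 reads XCR0: bit 1 is OS-enabled XMM state and
	// bit 2 is OS-enabled YMM state, so masking with $6 and comparing
	// against $6 verifies both.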
   183  	XGETBV
   184  	ANDL	$6, AX
   185  	CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
   186  	JE nocpuinfo
   187  noavx:
   188  	MOVB $0, runtime·support_avx(SB)
   189  	MOVB $0, runtime·support_avx2(SB)
   190  
   191  nocpuinfo:
   192  	// if there is an _cgo_init, call it.
   193  	MOVQ	_cgo_init(SB), AX
   194  	TESTQ	AX, AX
   195  	JZ	needtls
   196  	// g0 already in DI
   197  	MOVQ	DI, CX	// Win64 uses CX for first parameter
   198  	MOVQ	$setg_gcc<>(SB), SI
   199  	CALL	AX
   200  
   201  	// update stackguard after _cgo_init
   202  	MOVQ	$runtime·g0(SB), CX
   203  	MOVQ	(g_stack+stack_lo)(CX), AX
   204  	ADDQ	$const__StackGuard, AX
   205  	MOVQ	AX, g_stackguard0(CX)
   206  	MOVQ	AX, g_stackguard1(CX)
   207  
   208  #ifndef GOOS_windows
   209  	JMP ok
   210  #endif
   211  needtls:
   212  #ifdef GOOS_plan9
   213  	// skip TLS setup on Plan 9
   214  	JMP ok
   215  #endif
   216  #ifdef GOOS_solaris
   217  	// skip TLS setup on Solaris
   218  	JMP ok
   219  #endif
   220  
   221  	LEAQ	runtime·m0+m_tls(SB), DI
   222  	CALL	runtime·settls(SB)
   223  
   224  	// store through it, to make sure it works
   225  	get_tls(BX)
   226  	MOVQ	$0x123, g(BX)
   227  	MOVQ	runtime·m0+m_tls(SB), AX
   228  	CMPQ	AX, $0x123
   229  	JEQ 2(PC)
   230  	MOVL	AX, 0	// abort
   231  ok:
   232  	// set the per-goroutine and per-mach "registers"
   233  	get_tls(BX)
   234  	LEAQ	runtime·g0(SB), CX
   235  	MOVQ	CX, g(BX)
   236  	LEAQ	runtime·m0(SB), AX
   237  
   238  	// save m->g0 = g0
   239  	MOVQ	CX, m_g0(AX)
   240  	// save m0 to g0->m
   241  	MOVQ	AX, g_m(CX)
   242  
   243  	CLD				// convention is D is always left cleared
   244  	CALL	runtime·check(SB)
   245  
   246  	MOVL	16(SP), AX		// copy argc
   247  	MOVL	AX, 0(SP)
   248  	MOVQ	24(SP), AX		// copy argv
   249  	MOVQ	AX, 8(SP)
   250  	CALL	runtime·args(SB)
   251  	CALL	runtime·osinit(SB)
   252  	CALL	runtime·schedinit(SB)
   253  
   254  	// create a new goroutine to start program
   255  	MOVQ	$runtime·mainPC(SB), AX		// entry
   256  	PUSHQ	AX
   257  	PUSHQ	$0			// arg size
   258  	CALL	runtime·newproc(SB)
   259  	POPQ	AX
   260  	POPQ	AX
   261  
   262  	// start this M
   263  	CALL	runtime·mstart(SB)
   264  
   265  	MOVL	$0xf1, 0xf1  // crash
   266  	RET
   267  
   268  DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
   269  GLOBL	runtime·mainPC(SB),RODATA,$8
   270  
   271  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
   272  	BYTE	$0xcc
   273  	RET
   274  
   275  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   276  	// No per-thread init.
   277  	RET
   278  
   279  /*
   280   *  go-routine
   281   */
   282  
   283  // void gosave(Gobuf*)
   284  // save state in Gobuf; setjmp
   285  TEXT runtime·gosave(SB), NOSPLIT, $0-8
   286  	MOVQ	buf+0(FP), AX		// gobuf
   287  	LEAQ	buf+0(FP), BX		// caller's SP
   288  	MOVQ	BX, gobuf_sp(AX)
   289  	MOVQ	0(SP), BX		// caller's PC
   290  	MOVQ	BX, gobuf_pc(AX)
   291  	MOVQ	$0, gobuf_ret(AX)
   292  	MOVQ	BP, gobuf_bp(AX)
   293  	// Assert ctxt is zero. See func save.
   294  	MOVQ	gobuf_ctxt(AX), BX
   295  	TESTQ	BX, BX
   296  	JZ	2(PC)
   297  	CALL	runtime·badctxt(SB)
   298  	get_tls(CX)
   299  	MOVQ	g(CX), BX
   300  	MOVQ	BX, gobuf_g(AX)
   301  	RET
   302  
   303  // void gogo(Gobuf*)
   304  // restore state from Gobuf; longjmp
   305  TEXT runtime·gogo(SB), NOSPLIT, $16-8
   306  	MOVQ	buf+0(FP), BX		// gobuf
   307  	MOVQ	gobuf_g(BX), DX
   308  	MOVQ	0(DX), CX		// make sure g != nil
   309  	get_tls(CX)
   310  	MOVQ	DX, g(CX)
   311  	MOVQ	gobuf_sp(BX), SP	// restore SP
   312  	MOVQ	gobuf_ret(BX), AX
   313  	MOVQ	gobuf_ctxt(BX), DX
   314  	MOVQ	gobuf_bp(BX), BP
   315  	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   316  	MOVQ	$0, gobuf_ret(BX)
   317  	MOVQ	$0, gobuf_ctxt(BX)
   318  	MOVQ	$0, gobuf_bp(BX)
   319  	MOVQ	gobuf_pc(BX), BX
   320  	JMP	BX
   321  
   322  // func mcall(fn func(*g))
   323  // Switch to m->g0's stack, call fn(g).
   324  // Fn must never return. It should gogo(&g->sched)
   325  // to keep running g.
   326  TEXT runtime·mcall(SB), NOSPLIT, $0-8
   327  	MOVQ	fn+0(FP), DI
   328  	
   329  	get_tls(CX)
   330  	MOVQ	g(CX), AX	// save state in g->sched
   331  	MOVQ	0(SP), BX	// caller's PC
   332  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   333  	LEAQ	fn+0(FP), BX	// caller's SP
   334  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   335  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   336  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   337  
   338  	// switch to m->g0 & its stack, call fn
   339  	MOVQ	g(CX), BX
   340  	MOVQ	g_m(BX), BX
   341  	MOVQ	m_g0(BX), SI
   342  	CMPQ	SI, AX	// if g == m->g0 call badmcall
   343  	JNE	3(PC)
   344  	MOVQ	$runtime·badmcall(SB), AX
   345  	JMP	AX
   346  	MOVQ	SI, g(CX)	// g = m->g0
   347  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   348  	PUSHQ	AX
   349  	MOVQ	DI, DX
   350  	MOVQ	0(DI), DI
   351  	CALL	DI
   352  	POPQ	AX
   353  	MOVQ	$runtime·badmcall2(SB), AX
   354  	JMP	AX
   355  	RET
   356  
   357  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   358  // of the G stack. We need to distinguish the routine that
   359  // lives at the bottom of the G stack from the one that lives
   360  // at the top of the system stack because the one at the top of
   361  // the system stack terminates the stack walk (see topofstack()).
   362  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   363  	RET
   364  
   365  // func systemstack(fn func())
   366  TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   367  	MOVQ	fn+0(FP), DI	// DI = fn
   368  	get_tls(CX)
   369  	MOVQ	g(CX), AX	// AX = g
   370  	MOVQ	g_m(AX), BX	// BX = m
   371  
   372  	MOVQ	m_gsignal(BX), DX	// DX = gsignal
   373  	CMPQ	AX, DX
   374  	JEQ	noswitch
   375  
   376  	MOVQ	m_g0(BX), DX	// DX = g0
   377  	CMPQ	AX, DX
   378  	JEQ	noswitch
   379  
   380  	MOVQ	m_curg(BX), R8
   381  	CMPQ	AX, R8
   382  	JEQ	switch
   383  	
   384  	// Bad: g is not gsignal, not g0, not curg. What is it?
   385  	MOVQ	$runtime·badsystemstack(SB), AX
   386  	CALL	AX
   387  
   388  switch:
   389  	// save our state in g->sched. Pretend to
   390  	// be systemstack_switch if the G stack is scanned.
   391  	MOVQ	$runtime·systemstack_switch(SB), SI
   392  	MOVQ	SI, (g_sched+gobuf_pc)(AX)
   393  	MOVQ	SP, (g_sched+gobuf_sp)(AX)
   394  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   395  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   396  
   397  	// switch to g0
   398  	MOVQ	DX, g(CX)
   399  	MOVQ	(g_sched+gobuf_sp)(DX), BX
   400  	// make it look like mstart called systemstack on g0, to stop traceback
   401  	SUBQ	$8, BX
   402  	MOVQ	$runtime·mstart(SB), DX
   403  	MOVQ	DX, 0(BX)
   404  	MOVQ	BX, SP
   405  
   406  	// call target function
   407  	MOVQ	DI, DX
   408  	MOVQ	0(DI), DI
   409  	CALL	DI
   410  
   411  	// switch back to g
   412  	get_tls(CX)
   413  	MOVQ	g(CX), AX
   414  	MOVQ	g_m(AX), BX
   415  	MOVQ	m_curg(BX), AX
   416  	MOVQ	AX, g(CX)
   417  	MOVQ	(g_sched+gobuf_sp)(AX), SP
   418  	MOVQ	$0, (g_sched+gobuf_sp)(AX)
   419  	RET
   420  
   421  noswitch:
   422  	// already on m stack; tail call the function
   423  	// Using a tail call here cleans up tracebacks since we won't stop
   424  	// at an intermediate systemstack.
   425  	MOVQ	DI, DX
   426  	MOVQ	0(DI), DI
   427  	JMP	DI
   428  
   429  /*
   430   * support for morestack
   431   */
   432  
   433  // Called during function prolog when more stack is needed.
   434  //
   435  // The traceback routines see morestack on a g0 as being
   436  // the top of a stack (for example, morestack calling newstack
   437  // calling the scheduler calling newm calling gc), so we must
   438  // record an argument size. For that purpose, it has no arguments.
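//
// On entry, DX holds the context register (ctxt) of the function whose
// prolog called morestack; it is preserved into g->sched.ctxt below.
// morestack_noctxt (further down) clears DX before jumping here.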
   439  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   440  	// Cannot grow scheduler stack (m->g0).
   441  	get_tls(CX)
   442  	MOVQ	g(CX), BX
   443  	MOVQ	g_m(BX), BX
   444  	MOVQ	m_g0(BX), SI
   445  	CMPQ	g(CX), SI
   446  	JNE	3(PC)
   447  	CALL	runtime·badmorestackg0(SB)
   448  	INT	$3
   449  
   450  	// Cannot grow signal stack (m->gsignal).
   451  	MOVQ	m_gsignal(BX), SI
   452  	CMPQ	g(CX), SI
   453  	JNE	3(PC)
   454  	CALL	runtime·badmorestackgsignal(SB)
   455  	INT	$3
   456  
   457  	// Called from f.
   458  	// Set m->morebuf to f's caller.
   459  	MOVQ	8(SP), AX	// f's caller's PC
   460  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   461  	LEAQ	16(SP), AX	// f's caller's SP
   462  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   463  	get_tls(CX)
   464  	MOVQ	g(CX), SI
   465  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   466  
   467  	// Set g->sched to context in f.
   468  	MOVQ	0(SP), AX // f's PC
   469  	MOVQ	AX, (g_sched+gobuf_pc)(SI)
   470  	MOVQ	SI, (g_sched+gobuf_g)(SI)
   471  	LEAQ	8(SP), AX // f's SP
   472  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   473  	MOVQ	BP, (g_sched+gobuf_bp)(SI)
   474  	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
   475  
   476  	// Call newstack on m->g0's stack.
   477  	MOVQ	m_g0(BX), BX
   478  	MOVQ	BX, g(CX)
   479  	MOVQ	(g_sched+gobuf_sp)(BX), SP
   480  	CALL	runtime·newstack(SB)
   481  	MOVQ	$0, 0x1003	// crash if newstack returns
   482  	RET
   483  
   484  // morestack but not preserving ctxt.
   485  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   486  	MOVL	$0, DX
   487  	JMP	runtime·morestack(SB)
   488  
   489  // reflectcall: call a function with the given argument list
   490  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   491  // we don't have variable-sized frames, so we use a small number
   492  // of constant-sized-frame functions to encode a few bits of size in the pc.
   493  // Caution: ugly multiline assembly macros in your future!
   494  
   495  #define DISPATCH(NAME,MAXSIZE)		\
   496  	CMPQ	CX, $MAXSIZE;		\
   497  	JA	3(PC);			\
   498  	MOVQ	$NAME(SB), AX;		\
   499  	JMP	AX
   500  // Note: can't just "JMP NAME(SB)" - bad inlining results.
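// For example, DISPATCH(runtime·call64, 64) expands to roughly:
//	CMPQ	CX, $64
//	JA	3(PC)			// frame too big: fall through to the next DISPATCH
//	MOVQ	$runtime·call64(SB), AX
//	JMP	AX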
   501  
   502  TEXT reflect·call(SB), NOSPLIT, $0-0
   503  	JMP	·reflectcall(SB)
   504  
   505  TEXT ·reflectcall(SB), NOSPLIT, $0-32
   506  	MOVLQZX argsize+24(FP), CX
   507  	DISPATCH(runtime·call32, 32)
   508  	DISPATCH(runtime·call64, 64)
   509  	DISPATCH(runtime·call128, 128)
   510  	DISPATCH(runtime·call256, 256)
   511  	DISPATCH(runtime·call512, 512)
   512  	DISPATCH(runtime·call1024, 1024)
   513  	DISPATCH(runtime·call2048, 2048)
   514  	DISPATCH(runtime·call4096, 4096)
   515  	DISPATCH(runtime·call8192, 8192)
   516  	DISPATCH(runtime·call16384, 16384)
   517  	DISPATCH(runtime·call32768, 32768)
   518  	DISPATCH(runtime·call65536, 65536)
   519  	DISPATCH(runtime·call131072, 131072)
   520  	DISPATCH(runtime·call262144, 262144)
   521  	DISPATCH(runtime·call524288, 524288)
   522  	DISPATCH(runtime·call1048576, 1048576)
   523  	DISPATCH(runtime·call2097152, 2097152)
   524  	DISPATCH(runtime·call4194304, 4194304)
   525  	DISPATCH(runtime·call8388608, 8388608)
   526  	DISPATCH(runtime·call16777216, 16777216)
   527  	DISPATCH(runtime·call33554432, 33554432)
   528  	DISPATCH(runtime·call67108864, 67108864)
   529  	DISPATCH(runtime·call134217728, 134217728)
   530  	DISPATCH(runtime·call268435456, 268435456)
   531  	DISPATCH(runtime·call536870912, 536870912)
   532  	DISPATCH(runtime·call1073741824, 1073741824)
   533  	MOVQ	$runtime·badreflectcall(SB), AX
   534  	JMP	AX
   535  
   536  #define CALLFN(NAME,MAXSIZE)			\
   537  TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   538  	NO_LOCAL_POINTERS;			\
   539  	/* copy arguments to stack */		\
   540  	MOVQ	argptr+16(FP), SI;		\
   541  	MOVLQZX argsize+24(FP), CX;		\
   542  	MOVQ	SP, DI;				\
   543  	REP;MOVSB;				\
   544  	/* call function */			\
   545  	MOVQ	f+8(FP), DX;			\
   546  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   547  	CALL	(DX);				\
   548  	/* copy return values back */		\
   549  	MOVQ	argtype+0(FP), DX;		\
   550  	MOVQ	argptr+16(FP), DI;		\
   551  	MOVLQZX	argsize+24(FP), CX;		\
   552  	MOVLQZX	retoffset+28(FP), BX;		\
   553  	MOVQ	SP, SI;				\
   554  	ADDQ	BX, DI;				\
   555  	ADDQ	BX, SI;				\
   556  	SUBQ	BX, CX;				\
   557  	CALL	callRet<>(SB);			\
   558  	RET
   559  
   560  // callRet copies return values back at the end of call*. This is a
   561  // separate function so it can allocate stack space for the arguments
   562  // to reflectcallmove. It does not follow the Go ABI; it expects its
   563  // arguments in registers.
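// On entry (as set up by CALLFN): DX = argtype, DI = destination of the
// results (argptr+retoffset), SI = source of the results (in the call
// frame), CX = number of result bytes; callRet passes these four values
// on to runtime·reflectcallmove.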
   564  TEXT callRet<>(SB), NOSPLIT, $32-0
   565  	NO_LOCAL_POINTERS
   566  	MOVQ	DX, 0(SP)
   567  	MOVQ	DI, 8(SP)
   568  	MOVQ	SI, 16(SP)
   569  	MOVQ	CX, 24(SP)
   570  	CALL	runtime·reflectcallmove(SB)
   571  	RET
   572  
   573  CALLFN(·call32, 32)
   574  CALLFN(·call64, 64)
   575  CALLFN(·call128, 128)
   576  CALLFN(·call256, 256)
   577  CALLFN(·call512, 512)
   578  CALLFN(·call1024, 1024)
   579  CALLFN(·call2048, 2048)
   580  CALLFN(·call4096, 4096)
   581  CALLFN(·call8192, 8192)
   582  CALLFN(·call16384, 16384)
   583  CALLFN(·call32768, 32768)
   584  CALLFN(·call65536, 65536)
   585  CALLFN(·call131072, 131072)
   586  CALLFN(·call262144, 262144)
   587  CALLFN(·call524288, 524288)
   588  CALLFN(·call1048576, 1048576)
   589  CALLFN(·call2097152, 2097152)
   590  CALLFN(·call4194304, 4194304)
   591  CALLFN(·call8388608, 8388608)
   592  CALLFN(·call16777216, 16777216)
   593  CALLFN(·call33554432, 33554432)
   594  CALLFN(·call67108864, 67108864)
   595  CALLFN(·call134217728, 134217728)
   596  CALLFN(·call268435456, 268435456)
   597  CALLFN(·call536870912, 536870912)
   598  CALLFN(·call1073741824, 1073741824)
   599  
   600  TEXT runtime·procyield(SB),NOSPLIT,$0-0
   601  	MOVL	cycles+0(FP), AX
   602  again:
   603  	PAUSE
   604  	SUBL	$1, AX
   605  	JNZ	again
   606  	RET
   607  
   608  
   609  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   610  	// Stores are already ordered on x86, so this is just a
   611  	// compile barrier.
   612  	RET
   613  
   614  // void jmpdefer(fn, sp);
   615  // called from deferreturn.
   616  // 1. pop the caller
    617  	// 2. sub 5 bytes from the caller's return address
   618  // 3. jmp to the argument
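//
// Rewinding the return address by 5 bytes works because deferreturn is
// always invoked by a direct CALL instruction, which is 5 bytes on amd64;
// when the deferred function returns, it re-executes that CALL, so
// deferreturn runs again and handles the next deferred call, if any.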
   619  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   620  	MOVQ	fv+0(FP), DX	// fn
   621  	MOVQ	argp+8(FP), BX	// caller sp
   622  	LEAQ	-8(BX), SP	// caller sp after CALL
   623  	MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
   624  	SUBQ	$5, (SP)	// return to CALL again
   625  	MOVQ	0(DX), BX
   626  	JMP	BX	// but first run the deferred function
   627  
   628  // Save state of caller into g->sched. Smashes R8, R9.
   629  TEXT gosave<>(SB),NOSPLIT,$0
   630  	get_tls(R8)
   631  	MOVQ	g(R8), R8
   632  	MOVQ	0(SP), R9
   633  	MOVQ	R9, (g_sched+gobuf_pc)(R8)
   634  	LEAQ	8(SP), R9
   635  	MOVQ	R9, (g_sched+gobuf_sp)(R8)
   636  	MOVQ	$0, (g_sched+gobuf_ret)(R8)
   637  	MOVQ	BP, (g_sched+gobuf_bp)(R8)
   638  	// Assert ctxt is zero. See func save.
   639  	MOVQ	(g_sched+gobuf_ctxt)(R8), R9
   640  	TESTQ	R9, R9
   641  	JZ	2(PC)
   642  	CALL	runtime·badctxt(SB)
   643  	RET
   644  
   645  // func asmcgocall(fn, arg unsafe.Pointer) int32
   646  // Call fn(arg) on the scheduler stack,
   647  // aligned appropriately for the gcc ABI.
   648  // See cgocall.go for more details.
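// The int32 result stored at ret+16(FP) below is whatever fn left in AX,
// i.e. fn's return value under the C calling convention.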
   649  TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   650  	MOVQ	fn+0(FP), AX
   651  	MOVQ	arg+8(FP), BX
   652  
   653  	MOVQ	SP, DX
   654  
   655  	// Figure out if we need to switch to m->g0 stack.
   656  	// We get called to create new OS threads too, and those
   657  	// come in on the m->g0 stack already.
   658  	get_tls(CX)
   659  	MOVQ	g(CX), R8
   660  	CMPQ	R8, $0
   661  	JEQ	nosave
   662  	MOVQ	g_m(R8), R8
   663  	MOVQ	m_g0(R8), SI
   664  	MOVQ	g(CX), DI
   665  	CMPQ	SI, DI
   666  	JEQ	nosave
   667  	MOVQ	m_gsignal(R8), SI
   668  	CMPQ	SI, DI
   669  	JEQ	nosave
   670  	
   671  	// Switch to system stack.
   672  	MOVQ	m_g0(R8), SI
   673  	CALL	gosave<>(SB)
   674  	MOVQ	SI, g(CX)
   675  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   676  
   677  	// Now on a scheduling stack (a pthread-created stack).
   678  	// Make sure we have enough room for 4 stack-backed fast-call
   679  	// registers as per windows amd64 calling convention.
   680  	SUBQ	$64, SP
   681  	ANDQ	$~15, SP	// alignment for gcc ABI
   682  	MOVQ	DI, 48(SP)	// save g
   683  	MOVQ	(g_stack+stack_hi)(DI), DI
   684  	SUBQ	DX, DI
   685  	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   686  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   687  	MOVQ	BX, CX		// CX = first argument in Win64
   688  	CALL	AX
   689  
   690  	// Restore registers, g, stack pointer.
   691  	get_tls(CX)
   692  	MOVQ	48(SP), DI
   693  	MOVQ	(g_stack+stack_hi)(DI), SI
   694  	SUBQ	40(SP), SI
   695  	MOVQ	DI, g(CX)
   696  	MOVQ	SI, SP
   697  
   698  	MOVL	AX, ret+16(FP)
   699  	RET
   700  
   701  nosave:
   702  	// Running on a system stack, perhaps even without a g.
   703  	// Having no g can happen during thread creation or thread teardown
   704  	// (see needm/dropm on Solaris, for example).
   705  	// This code is like the above sequence but without saving/restoring g
   706  	// and without worrying about the stack moving out from under us
   707  	// (because we're on a system stack, not a goroutine stack).
   708  	// The above code could be used directly if already on a system stack,
   709  	// but then the only path through this code would be a rare case on Solaris.
   710  	// Using this code for all "already on system stack" calls exercises it more,
   711  	// which should help keep it correct.
   712  	SUBQ	$64, SP
   713  	ANDQ	$~15, SP
   714  	MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
   715  	MOVQ	DX, 40(SP)	// save original stack pointer
   716  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   717  	MOVQ	BX, CX		// CX = first argument in Win64
   718  	CALL	AX
   719  	MOVQ	40(SP), SI	// restore original stack pointer
   720  	MOVQ	SI, SP
   721  	MOVL	AX, ret+16(FP)
   722  	RET
   723  
   724  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
   725  // Turn the fn into a Go func (by taking its address) and call
   726  // cgocallback_gofunc.
   727  TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
   728  	LEAQ	fn+0(FP), AX
   729  	MOVQ	AX, 0(SP)
   730  	MOVQ	frame+8(FP), AX
   731  	MOVQ	AX, 8(SP)
   732  	MOVQ	framesize+16(FP), AX
   733  	MOVQ	AX, 16(SP)
   734  	MOVQ	ctxt+24(FP), AX
   735  	MOVQ	AX, 24(SP)
   736  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   737  	CALL	AX
   738  	RET
   739  
   740  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
   741  // See cgocall.go for more details.
   742  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
   743  	NO_LOCAL_POINTERS
   744  
   745  	// If g is nil, Go did not create the current thread.
   746  	// Call needm to obtain one m for temporary use.
   747  	// In this case, we're running on the thread stack, so there's
   748  	// lots of space, but the linker doesn't know. Hide the call from
   749  	// the linker analysis by using an indirect call through AX.
   750  	get_tls(CX)
   751  #ifdef GOOS_windows
   752  	MOVL	$0, BX
   753  	CMPQ	CX, $0
   754  	JEQ	2(PC)
   755  #endif
   756  	MOVQ	g(CX), BX
   757  	CMPQ	BX, $0
   758  	JEQ	needm
   759  	MOVQ	g_m(BX), BX
   760  	MOVQ	BX, R8 // holds oldm until end of function
   761  	JMP	havem
   762  needm:
   763  	MOVQ	$0, 0(SP)
   764  	MOVQ	$runtime·needm(SB), AX
   765  	CALL	AX
   766  	MOVQ	0(SP), R8
   767  	get_tls(CX)
   768  	MOVQ	g(CX), BX
   769  	MOVQ	g_m(BX), BX
   770  	
   771  	// Set m->sched.sp = SP, so that if a panic happens
   772  	// during the function we are about to execute, it will
   773  	// have a valid SP to run on the g0 stack.
   774  	// The next few lines (after the havem label)
   775  	// will save this SP onto the stack and then write
   776  	// the same SP back to m->sched.sp. That seems redundant,
   777  	// but if an unrecovered panic happens, unwindm will
   778  	// restore the g->sched.sp from the stack location
   779  	// and then systemstack will try to use it. If we don't set it here,
   780  	// that restored SP will be uninitialized (typically 0) and
   781  	// will not be usable.
   782  	MOVQ	m_g0(BX), SI
   783  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   784  
   785  havem:
   786  	// Now there's a valid m, and we're running on its m->g0.
   787  	// Save current m->g0->sched.sp on stack and then set it to SP.
   788  	// Save current sp in m->g0->sched.sp in preparation for
   789  	// switch back to m->curg stack.
   790  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   791  	MOVQ	m_g0(BX), SI
   792  	MOVQ	(g_sched+gobuf_sp)(SI), AX
   793  	MOVQ	AX, 0(SP)
   794  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   795  
   796  	// Switch to m->curg stack and call runtime.cgocallbackg.
   797  	// Because we are taking over the execution of m->curg
   798  	// but *not* resuming what had been running, we need to
   799  	// save that information (m->curg->sched) so we can restore it.
   800  	// We can restore m->curg->sched.sp easily, because calling
   801  	// runtime.cgocallbackg leaves SP unchanged upon return.
   802  	// To save m->curg->sched.pc, we push it onto the stack.
   803  	// This has the added benefit that it looks to the traceback
   804  	// routine like cgocallbackg is going to return to that
   805  	// PC (because the frame we allocate below has the same
   806  	// size as cgocallback_gofunc's frame declared above)
   807  	// so that the traceback will seamlessly trace back into
   808  	// the earlier calls.
   809  	//
   810  	// In the new goroutine, 8(SP) holds the saved R8.
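	// Concretely, the code below carves a frame of the same size as
	// cgocallback_gofunc's own frame out of m->curg's stack: SP moves
	// down by (address of fv+0(FP)) - SP, the old sched.pc is stashed
	// just below the old sched.sp, and ctxt and the saved R8 are stored
	// at 0(SP) and 8(SP) for the call to runtime·cgocallbackg.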
   811  	MOVQ	m_curg(BX), SI
   812  	MOVQ	SI, g(CX)
   813  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   814  	MOVQ	(g_sched+gobuf_pc)(SI), BX
   815  	MOVQ	BX, -8(DI)
   816  	// Compute the size of the frame, including return PC and, if
   817  	// GOEXPERIMENT=framepointer, the saved base pointer
   818  	MOVQ	ctxt+24(FP), BX
   819  	LEAQ	fv+0(FP), AX
   820  	SUBQ	SP, AX
   821  	SUBQ	AX, DI
   822  	MOVQ	DI, SP
   823  
   824  	MOVQ	R8, 8(SP)
   825  	MOVQ	BX, 0(SP)
   826  	CALL	runtime·cgocallbackg(SB)
   827  	MOVQ	8(SP), R8
   828  
   829  	// Compute the size of the frame again. FP and SP have
   830  	// completely different values here than they did above,
   831  	// but only their difference matters.
   832  	LEAQ	fv+0(FP), AX
   833  	SUBQ	SP, AX
   834  
   835  	// Restore g->sched (== m->curg->sched) from saved values.
   836  	get_tls(CX)
   837  	MOVQ	g(CX), SI
   838  	MOVQ	SP, DI
   839  	ADDQ	AX, DI
   840  	MOVQ	-8(DI), BX
   841  	MOVQ	BX, (g_sched+gobuf_pc)(SI)
   842  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   843  
   844  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   845  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   846  	// so we do not have to restore it.)
   847  	MOVQ	g(CX), BX
   848  	MOVQ	g_m(BX), BX
   849  	MOVQ	m_g0(BX), SI
   850  	MOVQ	SI, g(CX)
   851  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   852  	MOVQ	0(SP), AX
   853  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   854  	
   855  	// If the m on entry was nil, we called needm above to borrow an m
   856  	// for the duration of the call. Since the call is over, return it with dropm.
   857  	CMPQ	R8, $0
   858  	JNE 3(PC)
   859  	MOVQ	$runtime·dropm(SB), AX
   860  	CALL	AX
   861  
   862  	// Done!
   863  	RET
   864  
   865  // void setg(G*); set g. for use by needm.
   866  TEXT runtime·setg(SB), NOSPLIT, $0-8
   867  	MOVQ	gg+0(FP), BX
   868  #ifdef GOOS_windows
   869  	CMPQ	BX, $0
   870  	JNE	settls
   871  	MOVQ	$0, 0x28(GS)
   872  	RET
   873  settls:
   874  	MOVQ	g_m(BX), AX
   875  	LEAQ	m_tls(AX), AX
   876  	MOVQ	AX, 0x28(GS)
   877  #endif
   878  	get_tls(CX)
   879  	MOVQ	BX, g(CX)
   880  	RET
   881  
   882  // void setg_gcc(G*); set g called from gcc.
   883  TEXT setg_gcc<>(SB),NOSPLIT,$0
   884  	get_tls(AX)
   885  	MOVQ	DI, g(AX)
   886  	RET
   887  
   888  // check that SP is in range [g->stack.lo, g->stack.hi)
   889  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   890  	get_tls(CX)
   891  	MOVQ	g(CX), AX
   892  	CMPQ	(g_stack+stack_hi)(AX), SP
   893  	JHI	2(PC)
   894  	INT	$3
   895  	CMPQ	SP, (g_stack+stack_lo)(AX)
   896  	JHI	2(PC)
   897  	INT	$3
   898  	RET
   899  
   900  // func cputicks() int64
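// RDTSC leaves the low 32 bits of the timestamp counter in AX and the high
// 32 bits in DX; the SHLQ/ADDQ below combine them into one 64-bit value.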
   901  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   902  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   903  	JNE	mfence
   904  	LFENCE
   905  	JMP	done
   906  mfence:
   907  	MFENCE
   908  done:
   909  	RDTSC
   910  	SHLQ	$32, DX
   911  	ADDQ	DX, AX
   912  	MOVQ	AX, ret+0(FP)
   913  	RET
   914  
   915  // hash function using AES hardware instructions
   916  TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   917  	MOVQ	p+0(FP), AX	// ptr to data
   918  	MOVQ	s+16(FP), CX	// size
   919  	LEAQ	ret+24(FP), DX
   920  	JMP	runtime·aeshashbody(SB)
   921  
   922  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
   923  	MOVQ	p+0(FP), AX	// ptr to string struct
   924  	MOVQ	8(AX), CX	// length of string
   925  	MOVQ	(AX), AX	// string data
   926  	LEAQ	ret+16(FP), DX
   927  	JMP	runtime·aeshashbody(SB)
   928  
   929  // AX: data
   930  // CX: length
   931  // DX: address to put return value
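// The body dispatches on length: 0-15, exactly 16, 17-32, 33-64, 65-128,
// and 129+ bytes each take their own path, mixing the data with the seed
// material using at least three AESENC rounds per 16-byte lane.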
   932  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   933  	// Fill an SSE register with our seeds.
   934  	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
   935  	PINSRW	$4, CX, X0			// 16 bits of length
   936  	PSHUFHW $0, X0, X0			// repeat length 4 times total
   937  	MOVO	X0, X1				// save unscrambled seed
   938  	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
   939  	AESENC	X0, X0				// scramble seed
   940  
   941  	CMPQ	CX, $16
   942  	JB	aes0to15
   943  	JE	aes16
   944  	CMPQ	CX, $32
   945  	JBE	aes17to32
   946  	CMPQ	CX, $64
   947  	JBE	aes33to64
   948  	CMPQ	CX, $128
   949  	JBE	aes65to128
   950  	JMP	aes129plus
   951  
   952  aes0to15:
   953  	TESTQ	CX, CX
   954  	JE	aes0
   955  
   956  	ADDQ	$16, AX
   957  	TESTW	$0xff0, AX
   958  	JE	endofpage
   959  
   960  	// 16 bytes loaded at this address won't cross
   961  	// a page boundary, so we can load it directly.
   962  	MOVOU	-16(AX), X1
   963  	ADDQ	CX, CX
   964  	MOVQ	$masks<>(SB), AX
   965  	PAND	(AX)(CX*8), X1
   966  final1:
   967  	PXOR	X0, X1	// xor data with seed
   968  	AESENC	X1, X1	// scramble combo 3 times
   969  	AESENC	X1, X1
   970  	AESENC	X1, X1
   971  	MOVQ	X1, (DX)
   972  	RET
   973  
   974  endofpage:
   975  	// address ends in 1111xxxx. Might be up against
   976  	// a page boundary, so load ending at last byte.
   977  	// Then shift bytes down using pshufb.
   978  	MOVOU	-32(AX)(CX*1), X1
   979  	ADDQ	CX, CX
   980  	MOVQ	$shifts<>(SB), AX
   981  	PSHUFB	(AX)(CX*8), X1
   982  	JMP	final1
   983  
   984  aes0:
   985  	// Return scrambled input seed
   986  	AESENC	X0, X0
   987  	MOVQ	X0, (DX)
   988  	RET
   989  
   990  aes16:
   991  	MOVOU	(AX), X1
   992  	JMP	final1
   993  
   994  aes17to32:
   995  	// make second starting seed
   996  	PXOR	runtime·aeskeysched+16(SB), X1
   997  	AESENC	X1, X1
   998  	
   999  	// load data to be hashed
  1000  	MOVOU	(AX), X2
  1001  	MOVOU	-16(AX)(CX*1), X3
  1002  
  1003  	// xor with seed
  1004  	PXOR	X0, X2
  1005  	PXOR	X1, X3
  1006  
  1007  	// scramble 3 times
  1008  	AESENC	X2, X2
  1009  	AESENC	X3, X3
  1010  	AESENC	X2, X2
  1011  	AESENC	X3, X3
  1012  	AESENC	X2, X2
  1013  	AESENC	X3, X3
  1014  
  1015  	// combine results
  1016  	PXOR	X3, X2
  1017  	MOVQ	X2, (DX)
  1018  	RET
  1019  
  1020  aes33to64:
  1021  	// make 3 more starting seeds
  1022  	MOVO	X1, X2
  1023  	MOVO	X1, X3
  1024  	PXOR	runtime·aeskeysched+16(SB), X1
  1025  	PXOR	runtime·aeskeysched+32(SB), X2
  1026  	PXOR	runtime·aeskeysched+48(SB), X3
  1027  	AESENC	X1, X1
  1028  	AESENC	X2, X2
  1029  	AESENC	X3, X3
  1030  	
  1031  	MOVOU	(AX), X4
  1032  	MOVOU	16(AX), X5
  1033  	MOVOU	-32(AX)(CX*1), X6
  1034  	MOVOU	-16(AX)(CX*1), X7
  1035  
  1036  	PXOR	X0, X4
  1037  	PXOR	X1, X5
  1038  	PXOR	X2, X6
  1039  	PXOR	X3, X7
  1040  	
  1041  	AESENC	X4, X4
  1042  	AESENC	X5, X5
  1043  	AESENC	X6, X6
  1044  	AESENC	X7, X7
  1045  	
  1046  	AESENC	X4, X4
  1047  	AESENC	X5, X5
  1048  	AESENC	X6, X6
  1049  	AESENC	X7, X7
  1050  	
  1051  	AESENC	X4, X4
  1052  	AESENC	X5, X5
  1053  	AESENC	X6, X6
  1054  	AESENC	X7, X7
  1055  
  1056  	PXOR	X6, X4
  1057  	PXOR	X7, X5
  1058  	PXOR	X5, X4
  1059  	MOVQ	X4, (DX)
  1060  	RET
  1061  
  1062  aes65to128:
  1063  	// make 7 more starting seeds
  1064  	MOVO	X1, X2
  1065  	MOVO	X1, X3
  1066  	MOVO	X1, X4
  1067  	MOVO	X1, X5
  1068  	MOVO	X1, X6
  1069  	MOVO	X1, X7
  1070  	PXOR	runtime·aeskeysched+16(SB), X1
  1071  	PXOR	runtime·aeskeysched+32(SB), X2
  1072  	PXOR	runtime·aeskeysched+48(SB), X3
  1073  	PXOR	runtime·aeskeysched+64(SB), X4
  1074  	PXOR	runtime·aeskeysched+80(SB), X5
  1075  	PXOR	runtime·aeskeysched+96(SB), X6
  1076  	PXOR	runtime·aeskeysched+112(SB), X7
  1077  	AESENC	X1, X1
  1078  	AESENC	X2, X2
  1079  	AESENC	X3, X3
  1080  	AESENC	X4, X4
  1081  	AESENC	X5, X5
  1082  	AESENC	X6, X6
  1083  	AESENC	X7, X7
  1084  
  1085  	// load data
  1086  	MOVOU	(AX), X8
  1087  	MOVOU	16(AX), X9
  1088  	MOVOU	32(AX), X10
  1089  	MOVOU	48(AX), X11
  1090  	MOVOU	-64(AX)(CX*1), X12
  1091  	MOVOU	-48(AX)(CX*1), X13
  1092  	MOVOU	-32(AX)(CX*1), X14
  1093  	MOVOU	-16(AX)(CX*1), X15
  1094  
  1095  	// xor with seed
  1096  	PXOR	X0, X8
  1097  	PXOR	X1, X9
  1098  	PXOR	X2, X10
  1099  	PXOR	X3, X11
  1100  	PXOR	X4, X12
  1101  	PXOR	X5, X13
  1102  	PXOR	X6, X14
  1103  	PXOR	X7, X15
  1104  
  1105  	// scramble 3 times
  1106  	AESENC	X8, X8
  1107  	AESENC	X9, X9
  1108  	AESENC	X10, X10
  1109  	AESENC	X11, X11
  1110  	AESENC	X12, X12
  1111  	AESENC	X13, X13
  1112  	AESENC	X14, X14
  1113  	AESENC	X15, X15
  1114  
  1115  	AESENC	X8, X8
  1116  	AESENC	X9, X9
  1117  	AESENC	X10, X10
  1118  	AESENC	X11, X11
  1119  	AESENC	X12, X12
  1120  	AESENC	X13, X13
  1121  	AESENC	X14, X14
  1122  	AESENC	X15, X15
  1123  
  1124  	AESENC	X8, X8
  1125  	AESENC	X9, X9
  1126  	AESENC	X10, X10
  1127  	AESENC	X11, X11
  1128  	AESENC	X12, X12
  1129  	AESENC	X13, X13
  1130  	AESENC	X14, X14
  1131  	AESENC	X15, X15
  1132  
  1133  	// combine results
  1134  	PXOR	X12, X8
  1135  	PXOR	X13, X9
  1136  	PXOR	X14, X10
  1137  	PXOR	X15, X11
  1138  	PXOR	X10, X8
  1139  	PXOR	X11, X9
  1140  	PXOR	X9, X8
  1141  	MOVQ	X8, (DX)
  1142  	RET
  1143  
  1144  aes129plus:
  1145  	// make 7 more starting seeds
  1146  	MOVO	X1, X2
  1147  	MOVO	X1, X3
  1148  	MOVO	X1, X4
  1149  	MOVO	X1, X5
  1150  	MOVO	X1, X6
  1151  	MOVO	X1, X7
  1152  	PXOR	runtime·aeskeysched+16(SB), X1
  1153  	PXOR	runtime·aeskeysched+32(SB), X2
  1154  	PXOR	runtime·aeskeysched+48(SB), X3
  1155  	PXOR	runtime·aeskeysched+64(SB), X4
  1156  	PXOR	runtime·aeskeysched+80(SB), X5
  1157  	PXOR	runtime·aeskeysched+96(SB), X6
  1158  	PXOR	runtime·aeskeysched+112(SB), X7
  1159  	AESENC	X1, X1
  1160  	AESENC	X2, X2
  1161  	AESENC	X3, X3
  1162  	AESENC	X4, X4
  1163  	AESENC	X5, X5
  1164  	AESENC	X6, X6
  1165  	AESENC	X7, X7
  1166  	
  1167  	// start with last (possibly overlapping) block
  1168  	MOVOU	-128(AX)(CX*1), X8
  1169  	MOVOU	-112(AX)(CX*1), X9
  1170  	MOVOU	-96(AX)(CX*1), X10
  1171  	MOVOU	-80(AX)(CX*1), X11
  1172  	MOVOU	-64(AX)(CX*1), X12
  1173  	MOVOU	-48(AX)(CX*1), X13
  1174  	MOVOU	-32(AX)(CX*1), X14
  1175  	MOVOU	-16(AX)(CX*1), X15
  1176  
  1177  	// xor in seed
  1178  	PXOR	X0, X8
  1179  	PXOR	X1, X9
  1180  	PXOR	X2, X10
  1181  	PXOR	X3, X11
  1182  	PXOR	X4, X12
  1183  	PXOR	X5, X13
  1184  	PXOR	X6, X14
  1185  	PXOR	X7, X15
  1186  	
  1187  	// compute number of remaining 128-byte blocks
  1188  	DECQ	CX
  1189  	SHRQ	$7, CX
  1190  	
  1191  aesloop:
  1192  	// scramble state
  1193  	AESENC	X8, X8
  1194  	AESENC	X9, X9
  1195  	AESENC	X10, X10
  1196  	AESENC	X11, X11
  1197  	AESENC	X12, X12
  1198  	AESENC	X13, X13
  1199  	AESENC	X14, X14
  1200  	AESENC	X15, X15
  1201  
  1202  	// scramble state, xor in a block
  1203  	MOVOU	(AX), X0
  1204  	MOVOU	16(AX), X1
  1205  	MOVOU	32(AX), X2
  1206  	MOVOU	48(AX), X3
  1207  	AESENC	X0, X8
  1208  	AESENC	X1, X9
  1209  	AESENC	X2, X10
  1210  	AESENC	X3, X11
  1211  	MOVOU	64(AX), X4
  1212  	MOVOU	80(AX), X5
  1213  	MOVOU	96(AX), X6
  1214  	MOVOU	112(AX), X7
  1215  	AESENC	X4, X12
  1216  	AESENC	X5, X13
  1217  	AESENC	X6, X14
  1218  	AESENC	X7, X15
  1219  
  1220  	ADDQ	$128, AX
  1221  	DECQ	CX
  1222  	JNE	aesloop
  1223  
  1224  	// 3 more scrambles to finish
  1225  	AESENC	X8, X8
  1226  	AESENC	X9, X9
  1227  	AESENC	X10, X10
  1228  	AESENC	X11, X11
  1229  	AESENC	X12, X12
  1230  	AESENC	X13, X13
  1231  	AESENC	X14, X14
  1232  	AESENC	X15, X15
  1233  	AESENC	X8, X8
  1234  	AESENC	X9, X9
  1235  	AESENC	X10, X10
  1236  	AESENC	X11, X11
  1237  	AESENC	X12, X12
  1238  	AESENC	X13, X13
  1239  	AESENC	X14, X14
  1240  	AESENC	X15, X15
  1241  	AESENC	X8, X8
  1242  	AESENC	X9, X9
  1243  	AESENC	X10, X10
  1244  	AESENC	X11, X11
  1245  	AESENC	X12, X12
  1246  	AESENC	X13, X13
  1247  	AESENC	X14, X14
  1248  	AESENC	X15, X15
  1249  
  1250  	PXOR	X12, X8
  1251  	PXOR	X13, X9
  1252  	PXOR	X14, X10
  1253  	PXOR	X15, X11
  1254  	PXOR	X10, X8
  1255  	PXOR	X11, X9
  1256  	PXOR	X9, X8
  1257  	MOVQ	X8, (DX)
  1258  	RET
  1259  	
  1260  TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
  1261  	MOVQ	p+0(FP), AX	// ptr to data
  1262  	MOVQ	h+8(FP), X0	// seed
  1263  	PINSRD	$2, (AX), X0	// data
  1264  	AESENC	runtime·aeskeysched+0(SB), X0
  1265  	AESENC	runtime·aeskeysched+16(SB), X0
  1266  	AESENC	runtime·aeskeysched+32(SB), X0
  1267  	MOVQ	X0, ret+16(FP)
  1268  	RET
  1269  
  1270  TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
  1271  	MOVQ	p+0(FP), AX	// ptr to data
  1272  	MOVQ	h+8(FP), X0	// seed
  1273  	PINSRQ	$1, (AX), X0	// data
  1274  	AESENC	runtime·aeskeysched+0(SB), X0
  1275  	AESENC	runtime·aeskeysched+16(SB), X0
  1276  	AESENC	runtime·aeskeysched+32(SB), X0
  1277  	MOVQ	X0, ret+16(FP)
  1278  	RET
  1279  
  1280  // simple mask to get rid of data in the high part of the register.
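// Entry n (the 16 bytes at offset n*16) keeps the low n bytes of an XMM
// register and zeroes the rest; aeshashbody reaches entry n by first
// doubling CX and then using an *8 scaled index, i.e. (masks)(CX*16).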
  1281  DATA masks<>+0x00(SB)/8, $0x0000000000000000
  1282  DATA masks<>+0x08(SB)/8, $0x0000000000000000
  1283  DATA masks<>+0x10(SB)/8, $0x00000000000000ff
  1284  DATA masks<>+0x18(SB)/8, $0x0000000000000000
  1285  DATA masks<>+0x20(SB)/8, $0x000000000000ffff
  1286  DATA masks<>+0x28(SB)/8, $0x0000000000000000
  1287  DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
  1288  DATA masks<>+0x38(SB)/8, $0x0000000000000000
  1289  DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
  1290  DATA masks<>+0x48(SB)/8, $0x0000000000000000
  1291  DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
  1292  DATA masks<>+0x58(SB)/8, $0x0000000000000000
  1293  DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
  1294  DATA masks<>+0x68(SB)/8, $0x0000000000000000
  1295  DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
  1296  DATA masks<>+0x78(SB)/8, $0x0000000000000000
  1297  DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
  1298  DATA masks<>+0x88(SB)/8, $0x0000000000000000
  1299  DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
  1300  DATA masks<>+0x98(SB)/8, $0x00000000000000ff
  1301  DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
  1302  DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
  1303  DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
  1304  DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
  1305  DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
  1306  DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
  1307  DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
  1308  DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
  1309  DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
  1310  DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
  1311  DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
  1312  DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
  1313  GLOBL masks<>(SB),RODATA,$256
  1314  
  1315  TEXT ·checkASM(SB),NOSPLIT,$0-1
   1316  	// check that masks<>(SB) and shifts<>(SB) are 16-byte aligned
  1317  	MOVQ	$masks<>(SB), AX
  1318  	MOVQ	$shifts<>(SB), BX
  1319  	ORQ	BX, AX
  1320  	TESTQ	$15, AX
  1321  	SETEQ	ret+0(FP)
  1322  	RET
  1323  
  1324  // these are arguments to pshufb. They move data down from
  1325  // the high bytes of the register to the low bytes of the register.
  1326  // index is how many bytes to move.
  1327  DATA shifts<>+0x00(SB)/8, $0x0000000000000000
  1328  DATA shifts<>+0x08(SB)/8, $0x0000000000000000
  1329  DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
  1330  DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
  1331  DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
  1332  DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
  1333  DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
  1334  DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
  1335  DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
  1336  DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
  1337  DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
  1338  DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
  1339  DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
  1340  DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
  1341  DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
  1342  DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
  1343  DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
  1344  DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
  1345  DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
  1346  DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
  1347  DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
  1348  DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
  1349  DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
  1350  DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
  1351  DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
  1352  DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
  1353  DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
  1354  DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
  1355  DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
  1356  DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
  1357  DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
  1358  DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
  1359  GLOBL shifts<>(SB),RODATA,$256
  1360  
  1361  // memequal(p, q unsafe.Pointer, size uintptr) bool
  1362  TEXT runtime·memequal(SB),NOSPLIT,$0-25
  1363  	MOVQ	a+0(FP), SI
  1364  	MOVQ	b+8(FP), DI
  1365  	CMPQ	SI, DI
  1366  	JEQ	eq
  1367  	MOVQ	size+16(FP), BX
  1368  	LEAQ	ret+24(FP), AX
  1369  	JMP	runtime·memeqbody(SB)
  1370  eq:
  1371  	MOVB	$1, ret+24(FP)
  1372  	RET
  1373  
  1374  // memequal_varlen(a, b unsafe.Pointer) bool
  1375  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
  1376  	MOVQ	a+0(FP), SI
  1377  	MOVQ	b+8(FP), DI
  1378  	CMPQ	SI, DI
  1379  	JEQ	eq
  1380  	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
  1381  	LEAQ	ret+16(FP), AX
  1382  	JMP	runtime·memeqbody(SB)
  1383  eq:
  1384  	MOVB	$1, ret+16(FP)
  1385  	RET
  1386  
  1387  // a in SI
  1388  // b in DI
  1389  // count in BX
  1390  // address of result byte in AX
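// output:
//   the byte at (AX) is set to 1 if the two regions are equal, 0 otherwise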
  1391  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1392  	CMPQ	BX, $8
  1393  	JB	small
  1394  	CMPQ	BX, $64
  1395  	JB	bigloop
  1396  	CMPB    runtime·support_avx2(SB), $1
  1397  	JE	hugeloop_avx2
  1398  	
  1399  	// 64 bytes at a time using xmm registers
  1400  hugeloop:
  1401  	CMPQ	BX, $64
  1402  	JB	bigloop
  1403  	MOVOU	(SI), X0
  1404  	MOVOU	(DI), X1
  1405  	MOVOU	16(SI), X2
  1406  	MOVOU	16(DI), X3
  1407  	MOVOU	32(SI), X4
  1408  	MOVOU	32(DI), X5
  1409  	MOVOU	48(SI), X6
  1410  	MOVOU	48(DI), X7
  1411  	PCMPEQB	X1, X0
  1412  	PCMPEQB	X3, X2
  1413  	PCMPEQB	X5, X4
  1414  	PCMPEQB	X7, X6
  1415  	PAND	X2, X0
  1416  	PAND	X6, X4
  1417  	PAND	X4, X0
  1418  	PMOVMSKB X0, DX
  1419  	ADDQ	$64, SI
  1420  	ADDQ	$64, DI
  1421  	SUBQ	$64, BX
  1422  	CMPL	DX, $0xffff
  1423  	JEQ	hugeloop
  1424  	MOVB	$0, (AX)
  1425  	RET
  1426  
  1427  	// 64 bytes at a time using ymm registers
  1428  hugeloop_avx2:
  1429  	CMPQ	BX, $64
  1430  	JB	bigloop_avx2
  1431  	VMOVDQU	(SI), Y0
  1432  	VMOVDQU	(DI), Y1
  1433  	VMOVDQU	32(SI), Y2
  1434  	VMOVDQU	32(DI), Y3
  1435  	VPCMPEQB	Y1, Y0, Y4
  1436  	VPCMPEQB	Y2, Y3, Y5
  1437  	VPAND	Y4, Y5, Y6
  1438  	VPMOVMSKB Y6, DX
  1439  	ADDQ	$64, SI
  1440  	ADDQ	$64, DI
  1441  	SUBQ	$64, BX
  1442  	CMPL	DX, $0xffffffff
  1443  	JEQ	hugeloop_avx2
  1444  	VZEROUPPER
  1445  	MOVB	$0, (AX)
  1446  	RET
  1447  
  1448  bigloop_avx2:
  1449  	VZEROUPPER
  1450  
  1451  	// 8 bytes at a time using 64-bit register
  1452  bigloop:
  1453  	CMPQ	BX, $8
  1454  	JBE	leftover
  1455  	MOVQ	(SI), CX
  1456  	MOVQ	(DI), DX
  1457  	ADDQ	$8, SI
  1458  	ADDQ	$8, DI
  1459  	SUBQ	$8, BX
  1460  	CMPQ	CX, DX
  1461  	JEQ	bigloop
  1462  	MOVB	$0, (AX)
  1463  	RET
  1464  
  1465  	// remaining 0-8 bytes
  1466  leftover:
  1467  	MOVQ	-8(SI)(BX*1), CX
  1468  	MOVQ	-8(DI)(BX*1), DX
  1469  	CMPQ	CX, DX
  1470  	SETEQ	(AX)
  1471  	RET
  1472  
  1473  small:
  1474  	CMPQ	BX, $0
  1475  	JEQ	equal
  1476  
  1477  	LEAQ	0(BX*8), CX
  1478  	NEGQ	CX
  1479  
  1480  	CMPB	SI, $0xf8
  1481  	JA	si_high
  1482  
  1483  	// load at SI won't cross a page boundary.
  1484  	MOVQ	(SI), SI
  1485  	JMP	si_finish
  1486  si_high:
   1487  	// address ends in 11111xxx. Load the 8 bytes ending at the last byte we want, then shift them down into position.
  1488  	MOVQ	-8(SI)(BX*1), SI
  1489  	SHRQ	CX, SI
  1490  si_finish:
  1491  
  1492  	// same for DI.
  1493  	CMPB	DI, $0xf8
  1494  	JA	di_high
  1495  	MOVQ	(DI), DI
  1496  	JMP	di_finish
  1497  di_high:
  1498  	MOVQ	-8(DI)(BX*1), DI
  1499  	SHRQ	CX, DI
  1500  di_finish:
  1501  
  1502  	SUBQ	SI, DI
  1503  	SHLQ	CX, DI
  1504  equal:
  1505  	SETEQ	(AX)
  1506  	RET
  1507  
  1508  TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
  1509  	MOVQ	s1_base+0(FP), SI
  1510  	MOVQ	s1_len+8(FP), BX
  1511  	MOVQ	s2_base+16(FP), DI
  1512  	MOVQ	s2_len+24(FP), DX
  1513  	LEAQ	ret+32(FP), R9
  1514  	JMP	runtime·cmpbody(SB)
  1515  
  1516  TEXT bytes·Compare(SB),NOSPLIT,$0-56
  1517  	MOVQ	s1+0(FP), SI
  1518  	MOVQ	s1+8(FP), BX
  1519  	MOVQ	s2+24(FP), DI
  1520  	MOVQ	s2+32(FP), DX
  1521  	LEAQ	res+48(FP), R9
  1522  	JMP	runtime·cmpbody(SB)
  1523  
  1524  // input:
  1525  //   SI = a
  1526  //   DI = b
  1527  //   BX = alen
  1528  //   DX = blen
  1529  //   R9 = address of output word (stores -1/0/1 here)
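// output:
//   the word at (R9) is set to -1 if a sorts before b, +1 if a sorts after
//   b, and 0 if the contents are identical; a proper prefix sorts first,
//   matching bytes.Compare / string comparison semantics.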
  1530  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1531  	CMPQ	SI, DI
  1532  	JEQ	allsame
  1533  	CMPQ	BX, DX
  1534  	MOVQ	DX, R8
  1535  	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
  1536  	CMPQ	R8, $8
  1537  	JB	small
  1538  
  1539  	CMPQ	R8, $63
  1540  	JBE	loop
  1541  	CMPB    runtime·support_avx2(SB), $1
  1542  	JEQ     big_loop_avx2
  1543  	JMP	big_loop
  1544  loop:
  1545  	CMPQ	R8, $16
  1546  	JBE	_0through16
  1547  	MOVOU	(SI), X0
  1548  	MOVOU	(DI), X1
  1549  	PCMPEQB X0, X1
  1550  	PMOVMSKB X1, AX
  1551  	XORQ	$0xffff, AX	// convert EQ to NE
  1552  	JNE	diff16	// branch if at least one byte is not equal
  1553  	ADDQ	$16, SI
  1554  	ADDQ	$16, DI
  1555  	SUBQ	$16, R8
  1556  	JMP	loop
  1557  	
  1558  diff64:
  1559  	ADDQ	$48, SI
  1560  	ADDQ	$48, DI
  1561  	JMP	diff16
  1562  diff48:
  1563  	ADDQ	$32, SI
  1564  	ADDQ	$32, DI
  1565  	JMP	diff16
  1566  diff32:
  1567  	ADDQ	$16, SI
  1568  	ADDQ	$16, DI
  1569  	// AX = bit mask of differences
  1570  diff16:
  1571  	BSFQ	AX, BX	// index of first byte that differs
  1572  	XORQ	AX, AX
  1573  	MOVB	(SI)(BX*1), CX
  1574  	CMPB	CX, (DI)(BX*1)
  1575  	SETHI	AX
  1576  	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
  1577  	MOVQ	AX, (R9)
  1578  	RET
  1579  
  1580  	// 0 through 16 bytes left, alen>=8, blen>=8
  1581  _0through16:
  1582  	CMPQ	R8, $8
  1583  	JBE	_0through8
  1584  	MOVQ	(SI), AX
  1585  	MOVQ	(DI), CX
  1586  	CMPQ	AX, CX
  1587  	JNE	diff8
  1588  _0through8:
  1589  	MOVQ	-8(SI)(R8*1), AX
  1590  	MOVQ	-8(DI)(R8*1), CX
  1591  	CMPQ	AX, CX
  1592  	JEQ	allsame
  1593  
  1594  	// AX and CX contain parts of a and b that differ.
  1595  diff8:
  1596  	BSWAPQ	AX	// reverse order of bytes
  1597  	BSWAPQ	CX
  1598  	XORQ	AX, CX
  1599  	BSRQ	CX, CX	// index of highest bit difference
  1600  	SHRQ	CX, AX	// move a's bit to bottom
  1601  	ANDQ	$1, AX	// mask bit
  1602  	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
  1603  	MOVQ	AX, (R9)
  1604  	RET
  1605  
  1606  	// 0-7 bytes in common
  1607  small:
  1608  	LEAQ	(R8*8), CX	// bytes left -> bits left
   1609  	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
  1610  	JEQ	allsame
  1611  
   1612  	// load bytes of a into high bytes of SI
  1613  	CMPB	SI, $0xf8
  1614  	JA	si_high
  1615  	MOVQ	(SI), SI
  1616  	JMP	si_finish
  1617  si_high:
  1618  	MOVQ	-8(SI)(R8*1), SI
  1619  	SHRQ	CX, SI
  1620  si_finish:
  1621  	SHLQ	CX, SI
  1622  
   1623  	// load bytes of b into high bytes of DI
  1624  	CMPB	DI, $0xf8
  1625  	JA	di_high
  1626  	MOVQ	(DI), DI
  1627  	JMP	di_finish
  1628  di_high:
  1629  	MOVQ	-8(DI)(R8*1), DI
  1630  	SHRQ	CX, DI
  1631  di_finish:
  1632  	SHLQ	CX, DI
  1633  
  1634  	BSWAPQ	SI	// reverse order of bytes
  1635  	BSWAPQ	DI
  1636  	XORQ	SI, DI	// find bit differences
  1637  	JEQ	allsame
  1638  	BSRQ	DI, CX	// index of highest bit difference
  1639  	SHRQ	CX, SI	// move a's bit to bottom
  1640  	ANDQ	$1, SI	// mask bit
  1641  	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
  1642  	MOVQ	AX, (R9)
  1643  	RET
  1644  
  1645  allsame:
  1646  	XORQ	AX, AX
  1647  	XORQ	CX, CX
  1648  	CMPQ	BX, DX
  1649  	SETGT	AX	// 1 if alen > blen
  1650  	SETEQ	CX	// 1 if alen == blen
  1651  	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
  1652  	MOVQ	AX, (R9)
  1653  	RET
  1654  
  1655  	// this works for >= 64 bytes of data.
  1656  big_loop:
  1657  	MOVOU	(SI), X0
  1658  	MOVOU	(DI), X1
  1659  	PCMPEQB X0, X1
  1660  	PMOVMSKB X1, AX
  1661  	XORQ	$0xffff, AX
  1662  	JNE	diff16
  1663  
  1664  	MOVOU	16(SI), X0
  1665  	MOVOU	16(DI), X1
  1666  	PCMPEQB X0, X1
  1667  	PMOVMSKB X1, AX
  1668  	XORQ	$0xffff, AX
  1669  	JNE	diff32
  1670  
  1671  	MOVOU	32(SI), X0
  1672  	MOVOU	32(DI), X1
  1673  	PCMPEQB X0, X1
  1674  	PMOVMSKB X1, AX
  1675  	XORQ	$0xffff, AX
  1676  	JNE	diff48
  1677  
  1678  	MOVOU	48(SI), X0
  1679  	MOVOU	48(DI), X1
  1680  	PCMPEQB X0, X1
  1681  	PMOVMSKB X1, AX
  1682  	XORQ	$0xffff, AX
  1683  	JNE	diff64
  1684  
  1685  	ADDQ	$64, SI
  1686  	ADDQ	$64, DI
  1687  	SUBQ	$64, R8
  1688  	CMPQ	R8, $64
  1689  	JBE	loop
  1690  	JMP	big_loop
  1691  
  1692  	// Compare 64-bytes per loop iteration.
  1693  	// Loop is unrolled and uses AVX2.
  1694  big_loop_avx2:
  1695  	VMOVDQU	(SI), Y2
  1696  	VMOVDQU	(DI), Y3
  1697  	VMOVDQU	32(SI), Y4
  1698  	VMOVDQU	32(DI), Y5
  1699  	VPCMPEQB Y2, Y3, Y0
  1700  	VPMOVMSKB Y0, AX
  1701  	XORL	$0xffffffff, AX
  1702  	JNE	diff32_avx2
  1703  	VPCMPEQB Y4, Y5, Y6
  1704  	VPMOVMSKB Y6, AX
  1705  	XORL	$0xffffffff, AX
  1706  	JNE	diff64_avx2
  1707  
  1708  	ADDQ	$64, SI
  1709  	ADDQ	$64, DI
  1710  	SUBQ	$64, R8
  1711  	CMPQ	R8, $64
  1712  	JB	big_loop_avx2_exit
  1713  	JMP	big_loop_avx2
  1714  
  1715  	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
  1716  diff32_avx2:
  1717  	VZEROUPPER
  1718  	JMP diff16
  1719  
  1720  	// Same as diff32_avx2, but for last 32 bytes.
  1721  diff64_avx2:
  1722  	VZEROUPPER
  1723  	JMP diff48
  1724  
  1725  	// For <64 bytes remainder jump to normal loop.
  1726  big_loop_avx2_exit:
  1727  	VZEROUPPER
  1728  	JMP loop
  1729  
  1730  TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
  1731  	MOVQ s+0(FP), DI
  1732  	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
  1733  	MOVQ s_len+8(FP), DX
  1734  	MOVQ c+16(FP), BP
  1735  	MOVQ c_len+24(FP), AX
  1736  	MOVQ DI, R10
  1737  	LEAQ ret+32(FP), R11
  1738  	JMP  runtime·indexShortStr(SB)
  1739  
  1740  TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
  1741  	MOVQ s+0(FP), DI
  1742  	MOVQ s_len+8(FP), DX
  1743  	MOVQ c+24(FP), BP
  1744  	MOVQ c_len+32(FP), AX
  1745  	MOVQ DI, R10
  1746  	LEAQ ret+48(FP), R11
  1747  	JMP  runtime·indexShortStr(SB)
  1748  
   1749  // AX: length of the string we are searching for
   1750  // DX: length of the string in which we are searching
   1751  // DI: pointer to the string in which we are searching
   1752  // BP: pointer to the string we are searching for
   1753  // R11: address where the return value should be stored
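// R10: copy of the starting DI, kept so the success path can compute the
//      match offset; the fail path below stores -1 at (R11).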
  1754  TEXT runtime·indexShortStr(SB),NOSPLIT,$0
  1755  	CMPQ AX, DX
  1756  	JA fail
  1757  	CMPQ DX, $16
  1758  	JAE sse42
  1759  no_sse42:
  1760  	CMPQ AX, $2
  1761  	JA   _3_or_more
  1762  	MOVW (BP), BP
  1763  	LEAQ -1(DI)(DX*1), DX
  1764  loop2:
  1765  	MOVW (DI), SI
  1766  	CMPW SI,BP
  1767  	JZ success
  1768  	ADDQ $1,DI
  1769  	CMPQ DI,DX
  1770  	JB loop2
  1771  	JMP fail
  1772  _3_or_more:
  1773  	CMPQ AX, $3
  1774  	JA   _4_or_more
  1775  	MOVW 1(BP), BX
  1776  	MOVW (BP), BP
  1777  	LEAQ -2(DI)(DX*1), DX
  1778  loop3:
  1779  	MOVW (DI), SI
  1780  	CMPW SI,BP
  1781  	JZ   partial_success3
  1782  	ADDQ $1,DI
  1783  	CMPQ DI,DX
  1784  	JB loop3
  1785  	JMP fail
  1786  partial_success3:
  1787  	MOVW 1(DI), SI
  1788  	CMPW SI,BX
  1789  	JZ success
  1790  	ADDQ $1,DI
  1791  	CMPQ DI,DX
  1792  	JB loop3
  1793  	JMP fail
  1794  _4_or_more:
  1795  	CMPQ AX, $4
  1796  	JA   _5_or_more
  1797  	MOVL (BP), BP
  1798  	LEAQ -3(DI)(DX*1), DX
  1799  loop4:
  1800  	MOVL (DI), SI
  1801  	CMPL SI,BP
  1802  	JZ   success
  1803  	ADDQ $1,DI
  1804  	CMPQ DI,DX
  1805  	JB loop4
  1806  	JMP fail
  1807  _5_or_more:
  1808  	CMPQ AX, $7
  1809  	JA   _8_or_more
  1810  	LEAQ 1(DI)(DX*1), DX
  1811  	SUBQ AX, DX
  1812  	MOVL -4(BP)(AX*1), BX
  1813  	MOVL (BP), BP
  1814  loop5to7:
  1815  	MOVL (DI), SI
  1816  	CMPL SI,BP
  1817  	JZ   partial_success5to7
  1818  	ADDQ $1,DI
  1819  	CMPQ DI,DX
  1820  	JB loop5to7
  1821  	JMP fail
  1822  partial_success5to7:
  1823  	MOVL -4(AX)(DI*1), SI
  1824  	CMPL SI,BX
  1825  	JZ success
  1826  	ADDQ $1,DI
  1827  	CMPQ DI,DX
  1828  	JB loop5to7
  1829  	JMP fail
  1830  _8_or_more:
  1831  	CMPQ AX, $8
  1832  	JA   _9_or_more
  1833  	MOVQ (BP), BP
  1834  	LEAQ -7(DI)(DX*1), DX
  1835  loop8:
  1836  	MOVQ (DI), SI
  1837  	CMPQ SI,BP
  1838  	JZ   success
  1839  	ADDQ $1,DI
  1840  	CMPQ DI,DX
  1841  	JB loop8
  1842  	JMP fail
  1843  _9_or_more:
  1844  	CMPQ AX, $15
  1845  	JA   _16_or_more
  1846  	LEAQ 1(DI)(DX*1), DX
  1847  	SUBQ AX, DX
  1848  	MOVQ -8(BP)(AX*1), BX
  1849  	MOVQ (BP), BP
  1850  loop9to15:
  1851  	MOVQ (DI), SI
  1852  	CMPQ SI,BP
  1853  	JZ   partial_success9to15
  1854  	ADDQ $1,DI
  1855  	CMPQ DI,DX
  1856  	JB loop9to15
  1857  	JMP fail
  1858  partial_success9to15:
  1859  	MOVQ -8(AX)(DI*1), SI
  1860  	CMPQ SI,BX
  1861  	JZ success
  1862  	ADDQ $1,DI
  1863  	CMPQ DI,DX
  1864  	JB loop9to15
  1865  	JMP fail
  1866  _16_or_more:
  1867  	CMPQ AX, $16
  1868  	JA   _17_or_more
  1869  	MOVOU (BP), X1
  1870  	LEAQ -15(DI)(DX*1), DX
  1871  loop16:
  1872  	MOVOU (DI), X2
  1873  	PCMPEQB X1, X2
  1874  	PMOVMSKB X2, SI
  1875  	CMPQ  SI, $0xffff
  1876  	JE   success
  1877  	ADDQ $1,DI
  1878  	CMPQ DI,DX
  1879  	JB loop16
  1880  	JMP fail
  1881  _17_or_more:
  1882  	CMPQ AX, $31
  1883  	JA   _32_or_more
  1884  	LEAQ 1(DI)(DX*1), DX
  1885  	SUBQ AX, DX
  1886  	MOVOU -16(BP)(AX*1), X0
  1887  	MOVOU (BP), X1
  1888  loop17to31:
  1889  	MOVOU (DI), X2
  1890  	PCMPEQB X1,X2
  1891  	PMOVMSKB X2, SI
  1892  	CMPQ  SI, $0xffff
  1893  	JE   partial_success17to31
  1894  	ADDQ $1,DI
  1895  	CMPQ DI,DX
  1896  	JB loop17to31
  1897  	JMP fail
  1898  partial_success17to31:
  1899  	MOVOU -16(AX)(DI*1), X3
  1900  	PCMPEQB X0, X3
  1901  	PMOVMSKB X3, SI
  1902  	CMPQ  SI, $0xffff
  1903  	JE success
  1904  	ADDQ $1,DI
  1905  	CMPQ DI,DX
  1906  	JB loop17to31
  1907  	JMP fail
  1908  // We can get here only when AVX2 is enabled and the cutoff for indexShortStr is set to 63,
  1909  // so there is no need to check cpuid.
  1910  _32_or_more:
  1911  	CMPQ AX, $32
  1912  	JA   _33_to_63
  1913  	VMOVDQU (BP), Y1
  1914  	LEAQ -31(DI)(DX*1), DX
  1915  loop32:
  1916  	VMOVDQU (DI), Y2
  1917  	VPCMPEQB Y1, Y2, Y3
  1918  	VPMOVMSKB Y3, SI
  1919  	CMPL  SI, $0xffffffff
  1920  	JE   success_avx2
  1921  	ADDQ $1,DI
  1922  	CMPQ DI,DX
  1923  	JB loop32
  1924  	JMP fail_avx2
  1925  _33_to_63:
  1926  	LEAQ 1(DI)(DX*1), DX
  1927  	SUBQ AX, DX
  1928  	VMOVDQU -32(BP)(AX*1), Y0
  1929  	VMOVDQU (BP), Y1
  1930  loop33to63:
  1931  	VMOVDQU (DI), Y2
  1932  	VPCMPEQB Y1, Y2, Y3
  1933  	VPMOVMSKB Y3, SI
  1934  	CMPL  SI, $0xffffffff
  1935  	JE   partial_success33to63
  1936  	ADDQ $1,DI
  1937  	CMPQ DI,DX
  1938  	JB loop33to63
  1939  	JMP fail_avx2
  1940  partial_success33to63:
  1941  	VMOVDQU -32(AX)(DI*1), Y3
  1942  	VPCMPEQB Y0, Y3, Y4
  1943  	VPMOVMSKB Y4, SI
  1944  	CMPL  SI, $0xffffffff
  1945  	JE success_avx2
  1946  	ADDQ $1,DI
  1947  	CMPQ DI,DX
  1948  	JB loop33to63
  1949  fail_avx2:
  1950  	VZEROUPPER
  1951  fail:
  1952  	MOVQ $-1, (R11)
  1953  	RET
  1954  success_avx2:
  1955  	VZEROUPPER
  1956  	JMP success
  1957  sse42:
  1958  	CMPB runtime·support_sse42(SB), $1
  1959  	JNE no_sse42
  1960  	CMPQ AX, $12
  1961  	// PCMPESTRI is slower than a normal compare,
  1962  	// so using it makes sense only if we advance 4+ bytes per compare.
  1963  	// This value was determined experimentally and is roughly the same
  1964  	// on Nehalem (the first microarchitecture with SSE4.2) and Haswell.
  1965  	JAE _9_or_more
  1966  	LEAQ 16(BP), SI
  1967  	TESTW $0xff0, SI
  1968  	JEQ no_sse42
  1969  	MOVOU (BP), X1
  1970  	LEAQ -15(DI)(DX*1), SI
  1971  	MOVQ $16, R9
  1972  	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
  1973  loop_sse42:
  1974  	// 0x0c means: unsigned byte source data (bits 0,1 are 00),
  1975  	// "equal ordered" aggregation, i.e. substring search (bits 2,3 are 11),
  1976  	// result is not masked or inverted (bits 4,5 are 00),
  1977  	// and the index of the first matching byte is returned (bit 6 is 0).
  1978  	PCMPESTRI $0x0c, (DI), X1
  1979  	// CX == 16 means no match,
  1980  	// CX > R9 means a partial match at the end of the string,
  1981  	// otherwise sep begins at offset CX within the 16 bytes loaded from (DI).
  1982  	CMPQ CX, R9
  1983  	JBE sse42_success
  1984  	ADDQ R9, DI
  1985  	CMPQ DI, SI
  1986  	JB loop_sse42
  1987  	PCMPESTRI $0x0c, -1(SI), X1
  1988  	CMPQ CX, R9
  1989  	JA fail
  1990  	LEAQ -1(SI), DI
  1991  sse42_success:
  1992  	ADDQ CX, DI
  1993  success:
  1994  	SUBQ R10, DI
  1995  	MOVQ DI, (R11)
  1996  	RET
  1997  
  1998  
  1999  TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
  2000  	MOVQ s+0(FP), SI
  2001  	MOVQ s_len+8(FP), BX
  2002  	MOVB c+24(FP), AL
  2003  	LEAQ ret+32(FP), R8
  2004  	JMP  runtime·indexbytebody(SB)
  2005  
  2006  TEXT strings·IndexByte(SB),NOSPLIT,$0-32
  2007  	MOVQ s+0(FP), SI
  2008  	MOVQ s_len+8(FP), BX
  2009  	MOVB c+16(FP), AL
  2010  	LEAQ ret+24(FP), R8
  2011  	JMP  runtime·indexbytebody(SB)
  2012  
  2013  // input:
  2014  //   SI: data
  2015  //   BX: data len
  2016  //   AL: byte sought
  2017  //   R8: address to put result
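        //
        // Functionally, this amounts to the following plain-Go sketch
        // (illustrative only; the vectorized paths below are what actually run):
        //
        //	func indexByte(s []byte, c byte) int {
        //		for i, b := range s {
        //			if b == c {
        //				return i
        //			}
        //		}
        //		return -1
        //	}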
  2018  TEXT runtime·indexbytebody(SB),NOSPLIT,$0
  2019  	// Shuffle X0 around so that each byte contains
  2020  	// the character we're looking for.
  2021  	MOVD AX, X0
  2022  	PUNPCKLBW X0, X0
  2023  	PUNPCKLBW X0, X0
  2024  	PSHUFL $0, X0, X0
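        	// e.g. if AL is 0x61 ('a'), X0 now holds 0x61 in all 16 byte lanes.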
  2025  	
  2026  	CMPQ BX, $16
  2027  	JLT small
  2028  
  2029  	MOVQ SI, DI
  2030  
  2031  	CMPQ BX, $32
  2032  	JA avx2
  2033  sse:
  2034  	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
  2035  	JMP	sseloopentry
  2036  	
  2037  sseloop:
  2038  	// Move the next 16-byte chunk of the data into X1.
  2039  	MOVOU	(DI), X1
  2040  	// Compare bytes in X0 to X1.
  2041  	PCMPEQB	X0, X1
  2042  	// Take the top bit of each byte in X1 and put the result in DX.
  2043  	PMOVMSKB X1, DX
  2044  	// Find first set bit, if any.
  2045  	BSFL	DX, DX
  2046  	JNZ	ssesuccess
  2047  	// Advance to next block.
  2048  	ADDQ	$16, DI
  2049  sseloopentry:
  2050  	CMPQ	DI, AX
  2051  	JB	sseloop
  2052  
  2053  	// Search the last 16-byte chunk. This chunk may overlap with the
  2054  	// chunks we've already searched, but that's ok.
  2055  	MOVQ	AX, DI
  2056  	MOVOU	(AX), X1
  2057  	PCMPEQB	X0, X1
  2058  	PMOVMSKB X1, DX
  2059  	BSFL	DX, DX
  2060  	JNZ	ssesuccess
  2061  
  2062  failure:
  2063  	MOVQ $-1, (R8)
  2064  	RET
  2065  
  2066  // We've found a chunk containing the byte.
  2067  // The chunk was loaded from DI.
  2068  // The index of the matching byte in the chunk is DX.
  2069  // The start of the data is SI.
  2070  ssesuccess:
  2071  	SUBQ SI, DI	// Compute offset of chunk within data.
  2072  	ADDQ DX, DI	// Add offset of byte within chunk.
  2073  	MOVQ DI, (R8)
  2074  	RET
  2075  
  2076  // Handle lengths < 16.
  2077  small:
  2078  	TESTQ	BX, BX
  2079  	JEQ	failure
  2080  
  2081  	// Check if we'll load across a page boundary.
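        	// (If bits 4-11 of SI+16 are all zero, the end of a 16-byte load at SI
        	// would land within the first 16 bytes of a page, i.e. the load could
        	// spill onto the next page and fault, so take the endofpage path.)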
  2082  	LEAQ	16(SI), AX
  2083  	TESTW	$0xff0, AX
  2084  	JEQ	endofpage
  2085  
  2086  	MOVOU	(SI), X1 // Load data
  2087  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  2088  	PMOVMSKB X1, DX	// Move result bits to integer register.
  2089  	BSFL	DX, DX	// Find first set bit.
  2090  	JZ	failure	// No set bit, failure.
  2091  	CMPL	DX, BX
  2092  	JAE	failure	// Match is past end of data.
  2093  	MOVQ	DX, (R8)
  2094  	RET
  2095  
  2096  endofpage:
  2097  	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
  2098  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  2099  	PMOVMSKB X1, DX	// Move result bits to integer register.
  2100  	MOVL	BX, CX
  2101  	SHLL	CX, DX
  2102  	SHRL	$16, DX	// Shift desired bits down to bottom of register.
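        	// e.g. with BX=5 the five valid match bits start at bit 11; SHLL moves
        	// them to bits 16-20 and SHRL $16 brings them down to bits 0-4.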
  2103  	BSFL	DX, DX	// Find first set bit.
  2104  	JZ	failure	// No set bit, failure.
  2105  	MOVQ	DX, (R8)
  2106  	RET
  2107  
  2108  avx2:
  2109  	CMPB   runtime·support_avx2(SB), $1
  2110  	JNE sse
  2111  	MOVD AX, X0
  2112  	LEAQ -32(SI)(BX*1), R11
  2113  	VPBROADCASTB  X0, Y1
  2114  avx2_loop:
  2115  	VMOVDQU (DI), Y2
  2116  	VPCMPEQB Y1, Y2, Y3
  2117  	VPTEST Y3, Y3
  2118  	JNZ avx2success
  2119  	ADDQ $32, DI
  2120  	CMPQ DI, R11
  2121  	JLT avx2_loop
  2122  	MOVQ R11, DI
  2123  	VMOVDQU (DI), Y2
  2124  	VPCMPEQB Y1, Y2, Y3
  2125  	VPTEST Y3, Y3
  2126  	JNZ avx2success
  2127  	VZEROUPPER
  2128  	MOVQ $-1, (R8)
  2129  	RET
  2130  
  2131  avx2success:
  2132  	VPMOVMSKB Y3, DX
  2133  	BSFL DX, DX
  2134  	SUBQ SI, DI
  2135  	ADDQ DI, DX
  2136  	MOVQ DX, (R8)
  2137  	VZEROUPPER
  2138  	RET
  2139  
  2140  TEXT bytes·Equal(SB),NOSPLIT,$0-49
  2141  	MOVQ	a_len+8(FP), BX
  2142  	MOVQ	b_len+32(FP), CX
  2143  	CMPQ	BX, CX
  2144  	JNE	eqret
  2145  	MOVQ	a+0(FP), SI
  2146  	MOVQ	b+24(FP), DI
  2147  	LEAQ	ret+48(FP), AX
  2148  	JMP	runtime·memeqbody(SB)
  2149  eqret:
  2150  	MOVB	$0, ret+48(FP)
  2151  	RET
  2152  
  2153  
  2154  TEXT bytes·countByte(SB),NOSPLIT,$0-40
  2155  	MOVQ s+0(FP), SI
  2156  	MOVQ s_len+8(FP), BX
  2157  	MOVB c+24(FP), AL
  2158  	LEAQ ret+32(FP), R8
  2159  	JMP  runtime·countByte(SB)
  2160  
  2161  TEXT strings·countByte(SB),NOSPLIT,$0-32
  2162  	MOVQ s+0(FP), SI
  2163  	MOVQ s_len+8(FP), BX
  2164  	MOVB c+16(FP), AL
  2165  	LEAQ ret+24(FP), R8
  2166  	JMP  runtime·countByte(SB)
  2167  
  2168  // input:
  2169  //   SI: data
  2170  //   BX: data len
  2171  //   AL: byte sought
  2172  //   R8: address to put result
  2173  // This requires the POPCNT instruction
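        //
        // Functionally, this amounts to the following plain-Go sketch
        // (illustrative only; the vectorized paths below are what actually run):
        //
        //	func countByte(s []byte, c byte) int {
        //		n := 0
        //		for _, b := range s {
        //			if b == c {
        //				n++
        //			}
        //		}
        //		return n
        //	}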
  2174  TEXT runtime·countByte(SB),NOSPLIT,$0
  2175  	// Shuffle X0 around so that each byte contains
  2176  	// the character we're looking for.
  2177  	MOVD AX, X0
  2178  	PUNPCKLBW X0, X0
  2179  	PUNPCKLBW X0, X0
  2180  	PSHUFL $0, X0, X0
  2181  
  2182  	CMPQ BX, $16
  2183  	JLT small
  2184  
  2185  	MOVQ $0, R12 // Accumulator
  2186  
  2187  	MOVQ SI, DI
  2188  
  2189  	CMPQ BX, $32
  2190  	JA avx2
  2191  sse:
  2192  	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
  2193  	JMP	sseloopentry
  2194  
  2195  sseloop:
  2196  	// Move the next 16-byte chunk of the data into X1.
  2197  	MOVOU	(DI), X1
  2198  	// Compare bytes in X0 to X1.
  2199  	PCMPEQB	X0, X1
  2200  	// Take the top bit of each byte in X1 and put the result in DX.
  2201  	PMOVMSKB X1, DX
  2202  	// Count number of matching bytes
  2203  	POPCNTL DX, DX
  2204  	// Accumulate into R12
  2205  	ADDQ DX, R12
  2206  	// Advance to next block.
  2207  	ADDQ	$16, DI
  2208  sseloopentry:
  2209  	CMPQ	DI, AX
  2210  	JBE	sseloop
  2211  
  2212  	// Get the number of bytes to consider in the last 16 bytes
  2213  	ANDQ $15, BX
  2214  	JZ end
  2215  
  2216  	// Create a mask to ignore the overlap between the previous 16-byte
  2217  	// block and the next.
  2218  	MOVQ $16,CX
  2219  	SUBQ BX, CX
  2220  	MOVQ $0xFFFF, R10
  2221  	SARQ CL, R10
  2222  	SALQ CL, R10
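        	// e.g. with 3 trailing bytes CL is 13, so R10 becomes 0xE000 and only
        	// the match bits for the last 3 bytes of the final chunk survive.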
  2223  
  2224  	// Process the last 16-byte chunk. This chunk may overlap with the
  2225  	// chunks we've already searched so we need to mask part of it.
  2226  	MOVOU	(AX), X1
  2227  	PCMPEQB	X0, X1
  2228  	PMOVMSKB X1, DX
  2229  	// Apply mask
  2230  	ANDQ R10, DX
  2231  	POPCNTL DX, DX
  2232  	ADDQ DX, R12
  2233  end:
  2234  	MOVQ R12, (R8)
  2235  	RET
  2236  
  2237  // Handle lengths < 16.
  2238  small:
  2239  	TESTQ	BX, BX
  2240  	JEQ	endzero
  2241  
  2242  	// Check if we'll load across a page boundary.
  2243  	LEAQ	16(SI), AX
  2244  	TESTW	$0xff0, AX
  2245  	JEQ	endofpage
  2246  
  2247  	// We must ignore high bytes as they aren't part of our slice.
  2248  	// Create mask.
  2249  	MOVB BX, CX
  2250  	MOVQ $1, R10
  2251  	SALQ CL, R10
  2252  	SUBQ $1, R10
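        	// R10 = (1<<BX)-1, so only the low BX match bits (the real data) survive.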
  2253  
  2254  	// Load data
  2255  	MOVOU	(SI), X1
  2256  	// Compare target byte with each byte in data.
  2257  	PCMPEQB	X0, X1
  2258  	// Move result bits to integer register.
  2259  	PMOVMSKB X1, DX
  2260  	// Apply mask
  2261  	ANDQ R10, DX
  2262  	POPCNTL DX, DX
  2263  	// Directly return DX, we don't need to accumulate
  2264  	// since we have <16 bytes.
  2265  	MOVQ	DX, (R8)
  2266  	RET
  2267  endzero:
  2268  	MOVQ $0, (R8)
  2269  	RET
  2270  
  2271  endofpage:
  2272  	// We must ignore low bytes as they aren't part of our slice.
  2273  	MOVQ $16,CX
  2274  	SUBQ BX, CX
  2275  	MOVQ $0xFFFF, R10
  2276  	SARQ CL, R10
  2277  	SALQ CL, R10
  2278  
  2279  	// Load data into the high end of X1.
  2280  	MOVOU	-16(SI)(BX*1), X1
  2281  	// Compare target byte with each byte in data.
  2282  	PCMPEQB	X0, X1
  2283  	// Move result bits to integer register.
  2284  	PMOVMSKB X1, DX
  2285  	// Apply mask
  2286  	ANDQ R10, DX
  2287  	// Directly return DX, we don't need to accumulate
  2288  	// since we have <16 bytes.
  2289  	POPCNTL DX, DX
  2290  	MOVQ	DX, (R8)
  2291  	RET
  2292  
  2293  avx2:
  2294  	CMPB   runtime·support_avx2(SB), $1
  2295  	JNE sse
  2296  	MOVD AX, X0
  2297  	LEAQ -32(SI)(BX*1), R11
  2298  	VPBROADCASTB  X0, Y1
  2299  avx2_loop:
  2300  	VMOVDQU (DI), Y2
  2301  	VPCMPEQB Y1, Y2, Y3
  2302  	VPMOVMSKB Y3, DX
  2303  	POPCNTL DX, DX
  2304  	ADDQ DX, R12
  2305  	ADDQ $32, DI
  2306  	CMPQ DI, R11
  2307  	JLE avx2_loop
  2308  
  2309  	// If the last block has already been processed,
  2310  	// skip to the end.
  2311  	CMPQ DI, R11
  2312  	JEQ endavx
  2313  
  2314  	// Load address of the last 32 bytes.
  2315  	// There is an overlap with the previous block.
  2316  	MOVQ R11, DI
  2317  	VMOVDQU (DI), Y2
  2318  	VPCMPEQB Y1, Y2, Y3
  2319  	VPMOVMSKB Y3, DX
  2320  	// Exit AVX mode.
  2321  	VZEROUPPER
  2322  
  2323  	// Create a mask to ignore the overlap between the previous 32-byte
  2324  	// block and the next.
  2325  	ANDQ $31, BX
  2326  	MOVQ $32,CX
  2327  	SUBQ BX, CX
  2328  	MOVQ $0xFFFFFFFF, R10
  2329  	SARQ CL, R10
  2330  	SALQ CL, R10
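        	// e.g. with 3 trailing bytes CL is 29, so R10 becomes 0xE0000000 and only
        	// the match bits for the last 3 bytes of the final chunk survive.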
  2331  	// Apply mask
  2332  	ANDQ R10, DX
  2333  	POPCNTL DX, DX
  2334  	ADDQ DX, R12
  2335  	MOVQ R12, (R8)
  2336  	RET
  2337  endavx:
  2338  	// Exit AVX mode.
  2339  	VZEROUPPER
  2340  	MOVQ R12, (R8)
  2341  	RET
  2342  
  2343  TEXT runtime·return0(SB), NOSPLIT, $0
  2344  	MOVL	$0, AX
  2345  	RET
  2346  
  2347  
  2348  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  2349  // Must obey the gcc calling convention.
  2350  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  2351  	get_tls(CX)
  2352  	MOVQ	g(CX), AX
  2353  	MOVQ	g_m(AX), AX
  2354  	MOVQ	m_curg(AX), AX
  2355  	MOVQ	(g_stack+stack_hi)(AX), AX
  2356  	RET
  2357  
  2358  // The top-most function running on a goroutine
  2359  // returns to goexit+PCQuantum.
  2360  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  2361  	BYTE	$0x90	// NOP
  2362  	CALL	runtime·goexit1(SB)	// does not return
  2363  	// traceback from goexit1 must hit code range of goexit
  2364  	BYTE	$0x90	// NOP
  2365  
  2366  // This is called from .init_array and follows the platform, not Go, ABI.
  2367  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  2368  	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
  2369  	MOVQ	runtime·lastmoduledatap(SB), AX
  2370  	MOVQ	DI, moduledata_next(AX)
  2371  	MOVQ	DI, runtime·lastmoduledatap(SB)
  2372  	POPQ	R15
  2373  	RET
  2374  
  2375  // gcWriteBarrier performs a heap pointer write and informs the GC.
  2376  //
  2377  // gcWriteBarrier does NOT follow the Go ABI. It takes two arguments:
  2378  // - DI is the destination of the write
  2379  // - AX is the value being written at DI
  2380  // It clobbers FLAGS. It does not clobber any general-purpose registers,
  2381  // but may clobber others (e.g., SSE registers).
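        //
        // In rough Go terms the fast path below amounts to (an illustrative
        // sketch only; the field and helper names approximate the runtime's
        // Go-side wbBuf code and are not exact):
        //
        //	buf := &getg().m.p.ptr().wbBuf
        //	// queue the pair (new value, old *slot) at buf.next,
        //	// then advance buf.next by 16 bytes
        //	if buf.next == buf.end {
        //		wbBufFlush(slot, value) // spill the buffer to the GC
        //	}
        //	*slot = value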
  2382  TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$120
  2383  	// Save the registers clobbered by the fast path. This is slightly
  2384  	// faster than having the caller spill these.
  2385  	MOVQ	R14, 104(SP)
  2386  	MOVQ	R13, 112(SP)
  2387  	// TODO: Consider passing g.m.p in as an argument so they can be shared
  2388  	// across a sequence of write barriers.
  2389  	get_tls(R13)
  2390  	MOVQ	g(R13), R13
  2391  	MOVQ	g_m(R13), R13
  2392  	MOVQ	m_p(R13), R13
  2393  	MOVQ	(p_wbBuf+wbBuf_next)(R13), R14
  2394  	// Increment wbBuf.next position.
  2395  	LEAQ	16(R14), R14
  2396  	MOVQ	R14, (p_wbBuf+wbBuf_next)(R13)
  2397  	CMPQ	R14, (p_wbBuf+wbBuf_end)(R13)
  2398  	// Record the write.
  2399  	MOVQ	AX, -16(R14)	// Record value
  2400  	MOVQ	(DI), R13	// TODO: This turns bad writes into bad reads.
  2401  	MOVQ	R13, -8(R14)	// Record *slot
  2402  	// Is the buffer full? (flags set in CMPQ above)
  2403  	JEQ	flush
  2404  ret:
  2405  	MOVQ	104(SP), R14
  2406  	MOVQ	112(SP), R13
  2407  	// Do the write.
  2408  	MOVQ	AX, (DI)
  2409  	RET
  2410  
  2411  flush:
  2412  	// Save all general purpose registers since these could be
  2413  	// clobbered by wbBufFlush and were not saved by the caller.
  2414  	// It is possible for wbBufFlush to clobber other registers
  2415  	// (e.g., SSE registers), but the compiler takes care of saving
  2416  	// those in the caller if necessary. This strikes a balance
  2417  	// with registers that are likely to be used.
  2418  	//
  2419  	// We don't have type information for these, but all code under
  2420  	// here is NOSPLIT, so nothing will observe these.
  2421  	//
  2422  	// TODO: We could strike a different balance; e.g., saving X0
  2423  	// and not saving GP registers that are less likely to be used.
  2424  	MOVQ	DI, 0(SP)	// Also first argument to wbBufFlush
  2425  	MOVQ	AX, 8(SP)	// Also second argument to wbBufFlush
  2426  	MOVQ	BX, 16(SP)
  2427  	MOVQ	CX, 24(SP)
  2428  	MOVQ	DX, 32(SP)
  2429  	// DI already saved
  2430  	MOVQ	SI, 40(SP)
  2431  	MOVQ	BP, 48(SP)
  2432  	MOVQ	R8, 56(SP)
  2433  	MOVQ	R9, 64(SP)
  2434  	MOVQ	R10, 72(SP)
  2435  	MOVQ	R11, 80(SP)
  2436  	MOVQ	R12, 88(SP)
  2437  	// R13 already saved
  2438  	// R14 already saved
  2439  	MOVQ	R15, 96(SP)
  2440  
  2441  	// This takes arguments DI and AX
  2442  	CALL	runtime·wbBufFlush(SB)
  2443  
  2444  	MOVQ	0(SP), DI
  2445  	MOVQ	8(SP), AX
  2446  	MOVQ	16(SP), BX
  2447  	MOVQ	24(SP), CX
  2448  	MOVQ	32(SP), DX
  2449  	MOVQ	40(SP), SI
  2450  	MOVQ	48(SP), BP
  2451  	MOVQ	56(SP), R8
  2452  	MOVQ	64(SP), R9
  2453  	MOVQ	72(SP), R10
  2454  	MOVQ	80(SP), R11
  2455  	MOVQ	88(SP), R12
  2456  	MOVQ	96(SP), R15
  2457  	JMP	ret