github.com/filosottile/go@v0.0.0-20170906193555-dbed9972d994/src/runtime/asm_amd64.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
    11  	// copy arguments forward on an even stack
    12  	MOVQ	DI, AX		// argc
    13  	MOVQ	SI, BX		// argv
    14  	SUBQ	$(4*8+7), SP		// 2args 2auto
    15  	ANDQ	$~15, SP		// round SP down to a 16-byte boundary
    16  	MOVQ	AX, 16(SP)
    17  	MOVQ	BX, 24(SP)
    18  	
    19  	// create istack out of the given (operating system) stack.
    20  	// _cgo_init may update stackguard.
    21  	MOVQ	$runtime·g0(SB), DI
    22  	LEAQ	(-64*1024+104)(SP), BX	// claim 64KB of the OS stack for g0
    23  	MOVQ	BX, g_stackguard0(DI)
    24  	MOVQ	BX, g_stackguard1(DI)
    25  	MOVQ	BX, (g_stack+stack_lo)(DI)
    26  	MOVQ	SP, (g_stack+stack_hi)(DI)
    27  
    28  	// find out information about the processor we're on
    29  	MOVL	$0, AX		// CPUID leaf 0: vendor string + max standard leaf
    30  	CPUID
    31  	MOVL	AX, SI		// SI = highest supported standard CPUID leaf
    32  	CMPL	AX, $0
    33  	JE	nocpuinfo
    34  
    35  	// Figure out how to serialize RDTSC.
    36  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    37  	// Don't know about the rest, so let's do MFENCE.
    38  	CMPL	BX, $0x756E6547  // "Genu"
    39  	JNE	notintel
    40  	CMPL	DX, $0x49656E69  // "ineI"
    41  	JNE	notintel
    42  	CMPL	CX, $0x6C65746E  // "ntel"
    43  	JNE	notintel
    44  	MOVB	$1, runtime·isIntel(SB)
    45  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
    46  notintel:
    47  
    48  	// Load EAX=1 cpuid flags
    49  	MOVL	$1, AX
    50  	CPUID
    51  	MOVL	AX, runtime·processorVersionInfo(SB)
    52  
    53  	TESTL	$(1<<26), DX // SSE2
    54  	SETNE	runtime·support_sse2(SB)
    55  
    56  	TESTL	$(1<<9), CX // SSSE3
    57  	SETNE	runtime·support_ssse3(SB)
    58  
    59  	TESTL	$(1<<19), CX // SSE4.1
    60  	SETNE	runtime·support_sse41(SB)
    61  
    62  	TESTL	$(1<<20), CX // SSE4.2
    63  	SETNE	runtime·support_sse42(SB)
    64  
    65  	TESTL	$(1<<23), CX // POPCNT
    66  	SETNE	runtime·support_popcnt(SB)
    67  
    68  	TESTL	$(1<<25), CX // AES
    69  	SETNE	runtime·support_aes(SB)
    70  
    71  	TESTL	$(1<<27), CX // OSXSAVE
    72  	SETNE	runtime·support_osxsave(SB)
    73  
    74  	// If OS support for XMM and YMM is not present
    75  	// support_avx will be set back to false later.
    76  	TESTL	$(1<<28), CX // AVX
    77  	SETNE	runtime·support_avx(SB)
    78  
    79  eax7:
    80  	// Load EAX=7/ECX=0 cpuid flags
    81  	CMPL	SI, $7
    82  	JLT	osavx		// leaf 7 not supported on this CPU
    83  	MOVL	$7, AX
    84  	MOVL	$0, CX
    85  	CPUID
    86  
    87  	TESTL	$(1<<3), BX // BMI1
    88  	SETNE	runtime·support_bmi1(SB)
    89  
    90  	// If OS support for XMM and YMM is not present
    91  	// support_avx2 will be set back to false later.
    92  	TESTL	$(1<<5), BX // AVX2
    93  	SETNE	runtime·support_avx2(SB)
    94  
    95  	TESTL	$(1<<8), BX // BMI2
    96  	SETNE	runtime·support_bmi2(SB)
    97  
    98  	TESTL	$(1<<9), BX // ERMS
    99  	SETNE	runtime·support_erms(SB)
   100  
   101  osavx:
   102  	CMPB	runtime·support_osxsave(SB), $1
   103  	JNE	noavx
   104  	MOVL	$0, CX
   105  	// For XGETBV, OSXSAVE bit is required and sufficient
   106  	XGETBV
   107  	ANDL	$6, AX		// XCR0 bits 1-2: XMM and YMM state enabled by OS
   108  	CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
   109  	JE nocpuinfo
   110  noavx:
   111  	MOVB $0, runtime·support_avx(SB)
   112  	MOVB $0, runtime·support_avx2(SB)
   113  
   114  nocpuinfo:
   115  	// if there is an _cgo_init, call it.
   116  	MOVQ	_cgo_init(SB), AX
   117  	TESTQ	AX, AX
   118  	JZ	needtls
   119  	// g0 already in DI
   120  	MOVQ	DI, CX	// Win64 uses CX for first parameter
   121  	MOVQ	$setg_gcc<>(SB), SI
   122  	CALL	AX
   123  
   124  	// update stackguard after _cgo_init
   125  	MOVQ	$runtime·g0(SB), CX
   126  	MOVQ	(g_stack+stack_lo)(CX), AX
   127  	ADDQ	$const__StackGuard, AX
   128  	MOVQ	AX, g_stackguard0(CX)
   129  	MOVQ	AX, g_stackguard1(CX)
   130  
   131  #ifndef GOOS_windows
   132  	JMP ok
   133  #endif
   134  needtls:
   135  #ifdef GOOS_plan9
   136  	// skip TLS setup on Plan 9
   137  	JMP ok
   138  #endif
   139  #ifdef GOOS_solaris
   140  	// skip TLS setup on Solaris
   141  	JMP ok
   142  #endif
   143  
   144  	LEAQ	runtime·m0+m_tls(SB), DI
   145  	CALL	runtime·settls(SB)
   146  
   147  	// store through it, to make sure it works
   148  	get_tls(BX)
   149  	MOVQ	$0x123, g(BX)	// write a canary through the TLS slot
   150  	MOVQ	runtime·m0+m_tls(SB), AX
   151  	CMPQ	AX, $0x123	// did the canary land in m0.tls?
   152  	JEQ 2(PC)
   153  	MOVL	AX, 0	// abort
   154  ok:
   155  	// set the per-goroutine and per-mach "registers"
   156  	get_tls(BX)
   157  	LEAQ	runtime·g0(SB), CX
   158  	MOVQ	CX, g(BX)
   159  	LEAQ	runtime·m0(SB), AX
   160  
   161  	// save m->g0 = g0
   162  	MOVQ	CX, m_g0(AX)
   163  	// save m0 to g0->m
   164  	MOVQ	AX, g_m(CX)
   165  
   166  	CLD				// convention is D is always left cleared
   167  	CALL	runtime·check(SB)
   168  
   169  	MOVL	16(SP), AX		// copy argc
   170  	MOVL	AX, 0(SP)
   171  	MOVQ	24(SP), AX		// copy argv
   172  	MOVQ	AX, 8(SP)
   173  	CALL	runtime·args(SB)
   174  	CALL	runtime·osinit(SB)
   175  	CALL	runtime·schedinit(SB)
   176  
   177  	// create a new goroutine to start program
   178  	MOVQ	$runtime·mainPC(SB), AX		// entry
   179  	PUSHQ	AX
   180  	PUSHQ	$0			// arg size
   181  	CALL	runtime·newproc(SB)
   182  	POPQ	AX
   183  	POPQ	AX
   184  
   185  	// start this M
   186  	CALL	runtime·mstart(SB)
   187  
   188  	MOVL	$0xf1, 0xf1  // crash
   189  	RET
   190  
   191  DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)	// mainPC is a function value holding &runtime.main
   192  GLOBL	runtime·mainPC(SB),RODATA,$8
   193  
   194  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
   195  	BYTE	$0xcc	// INT 3: trap to the debugger
   196  	RET
   197  
   198  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   199  	// No per-thread init needed on amd64.
   200  	RET
   201  
   202  /*
   203   *  go-routine
   204   */
   205  
   206  // void gosave(Gobuf*)
   207  // save state in Gobuf; setjmp
   208  TEXT runtime·gosave(SB), NOSPLIT, $0-8
   209  	MOVQ	buf+0(FP), AX		// gobuf
   210  	LEAQ	buf+0(FP), BX		// caller's SP
   211  	MOVQ	BX, gobuf_sp(AX)
   212  	MOVQ	0(SP), BX		// caller's PC
   213  	MOVQ	BX, gobuf_pc(AX)
   214  	MOVQ	$0, gobuf_ret(AX)	// clear saved return value slot
   215  	MOVQ	BP, gobuf_bp(AX)
   216  	// Assert ctxt is zero. See func save.
   217  	MOVQ	gobuf_ctxt(AX), BX
   218  	TESTQ	BX, BX
   219  	JZ	2(PC)			// skip the abort when ctxt == 0
   220  	CALL	runtime·badctxt(SB)
   221  	get_tls(CX)
   222  	MOVQ	g(CX), BX		// record the current g in the gobuf
   223  	MOVQ	BX, gobuf_g(AX)
   224  	RET
   225  
   226  // void gogo(Gobuf*)
   227  // restore state from Gobuf; longjmp
   228  TEXT runtime·gogo(SB), NOSPLIT, $16-8
   229  	MOVQ	buf+0(FP), BX		// gobuf
   230  
   231  	// If ctxt is not nil, invoke deletion barrier before overwriting.
   232  	MOVQ	gobuf_ctxt(BX), AX
   233  	TESTQ	AX, AX
   234  	JZ	nilctxt
   235  	LEAQ	gobuf_ctxt(BX), AX
   236  	MOVQ	AX, 0(SP)		// arg 0: address of gobuf.ctxt slot
   237  	MOVQ	$0, 8(SP)		// arg 1: new value (nil)
   238  	CALL	runtime·writebarrierptr_prewrite(SB)
   239  	MOVQ	buf+0(FP), BX		// reload gobuf; the call may clobber BX
   240  
   241  nilctxt:
   242  	MOVQ	gobuf_g(BX), DX
   243  	MOVQ	0(DX), CX		// make sure g != nil
   244  	get_tls(CX)
   245  	MOVQ	DX, g(CX)
   246  	MOVQ	gobuf_sp(BX), SP	// restore SP
   247  	MOVQ	gobuf_ret(BX), AX
   248  	MOVQ	gobuf_ctxt(BX), DX
   249  	MOVQ	gobuf_bp(BX), BP
   250  	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   251  	MOVQ	$0, gobuf_ret(BX)
   252  	MOVQ	$0, gobuf_ctxt(BX)
   253  	MOVQ	$0, gobuf_bp(BX)
   254  	MOVQ	gobuf_pc(BX), BX
   255  	JMP	BX			// resume execution at the saved PC
   256  
   257  // func mcall(fn func(*g))
   258  // Switch to m->g0's stack, call fn(g).
   259  // Fn must never return. It should gogo(&g->sched)
   260  // to keep running g.
   261  TEXT runtime·mcall(SB), NOSPLIT, $0-8
   262  	MOVQ	fn+0(FP), DI
   263  	
   264  	get_tls(CX)
   265  	MOVQ	g(CX), AX	// save state in g->sched
   266  	MOVQ	0(SP), BX	// caller's PC
   267  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   268  	LEAQ	fn+0(FP), BX	// caller's SP
   269  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   270  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   271  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   272  
   273  	// switch to m->g0 & its stack, call fn
   274  	MOVQ	g(CX), BX
   275  	MOVQ	g_m(BX), BX
   276  	MOVQ	m_g0(BX), SI
   277  	CMPQ	SI, AX	// if g == m->g0 call badmcall
   278  	JNE	3(PC)
   279  	MOVQ	$runtime·badmcall(SB), AX
   280  	JMP	AX
   281  	MOVQ	SI, g(CX)	// g = m->g0
   282  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   283  	PUSHQ	AX	// argument: the g we switched away from
   284  	MOVQ	DI, DX	// DX = FuncVal, per closure calling convention
   285  	MOVQ	0(DI), DI	// code pointer from the FuncVal
   286  	CALL	DI
   287  	POPQ	AX
   288  	MOVQ	$runtime·badmcall2(SB), AX	// fn must never return
   289  	JMP	AX
   290  	RET
   291  
   292  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   293  // of the G stack. We need to distinguish the routine that
   294  // lives at the bottom of the G stack from the one that lives
   295  // at the top of the system stack because the one at the top of
   296  // the system stack terminates the stack walk (see topofstack()).
   297  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   298  	RET	// placeholder frame; only its PC is used, by the traceback code
   299  
   300  // func systemstack(fn func())
   301  TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   302  	MOVQ	fn+0(FP), DI	// DI = fn
   303  	get_tls(CX)
   304  	MOVQ	g(CX), AX	// AX = g
   305  	MOVQ	g_m(AX), BX	// BX = m
   306  
   307  	MOVQ	m_gsignal(BX), DX	// DX = gsignal
   308  	CMPQ	AX, DX
   309  	JEQ	noswitch
   310  
   311  	MOVQ	m_g0(BX), DX	// DX = g0
   312  	CMPQ	AX, DX
   313  	JEQ	noswitch
   314  
   315  	MOVQ	m_curg(BX), R8
   316  	CMPQ	AX, R8
   317  	JEQ	switch
   318  	
   319  	// Bad: g is not gsignal, not g0, not curg. What is it?
   320  	MOVQ	$runtime·badsystemstack(SB), AX
   321  	CALL	AX
   322  
   323  switch:
   324  	// save our state in g->sched. Pretend to
   325  	// be systemstack_switch if the G stack is scanned.
   326  	MOVQ	$runtime·systemstack_switch(SB), SI
   327  	MOVQ	SI, (g_sched+gobuf_pc)(AX)
   328  	MOVQ	SP, (g_sched+gobuf_sp)(AX)
   329  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   330  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   331  
   332  	// switch to g0
   333  	MOVQ	DX, g(CX)
   334  	MOVQ	(g_sched+gobuf_sp)(DX), BX
   335  	// make it look like mstart called systemstack on g0, to stop traceback
   336  	SUBQ	$8, BX
   337  	MOVQ	$runtime·mstart(SB), DX
   338  	MOVQ	DX, 0(BX)	// fake return PC = mstart
   339  	MOVQ	BX, SP
   340  
   341  	// call target function
   342  	MOVQ	DI, DX	// DX = FuncVal, per closure calling convention
   343  	MOVQ	0(DI), DI	// code pointer from the FuncVal
   344  	CALL	DI
   345  
   346  	// switch back to g
   347  	get_tls(CX)
   348  	MOVQ	g(CX), AX
   349  	MOVQ	g_m(AX), BX
   350  	MOVQ	m_curg(BX), AX
   351  	MOVQ	AX, g(CX)
   352  	MOVQ	(g_sched+gobuf_sp)(AX), SP
   353  	MOVQ	$0, (g_sched+gobuf_sp)(AX)	// clear saved SP; no longer needed
   354  	RET
   355  
   356  noswitch:
   357  	// already on m stack, just call directly
   358  	MOVQ	DI, DX	// DX = FuncVal
   359  	MOVQ	0(DI), DI
   360  	CALL	DI
   361  	RET
   362  
   363  /*
   364   * support for morestack
   365   */
   366  
   367  // Called during function prolog when more stack is needed.
   368  //
   369  // The traceback routines see morestack on a g0 as being
   370  // the top of a stack (for example, morestack calling newstack
   371  // calling the scheduler calling newm calling gc), so we must
   372  // record an argument size. For that purpose, it has no arguments.
   373  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   374  	// Cannot grow scheduler stack (m->g0).
   375  	get_tls(CX)
   376  	MOVQ	g(CX), BX
   377  	MOVQ	g_m(BX), BX
   378  	MOVQ	m_g0(BX), SI
   379  	CMPQ	g(CX), SI
   380  	JNE	3(PC)
   381  	CALL	runtime·badmorestackg0(SB)
   382  	INT	$3	// trap if badmorestackg0 somehow returns
   383  
   384  	// Cannot grow signal stack (m->gsignal).
   385  	MOVQ	m_gsignal(BX), SI
   386  	CMPQ	g(CX), SI
   387  	JNE	3(PC)
   388  	CALL	runtime·badmorestackgsignal(SB)
   389  	INT	$3	// trap if badmorestackgsignal somehow returns
   390  
   391  	// Called from f.
   392  	// Set m->morebuf to f's caller.
   393  	MOVQ	8(SP), AX	// f's caller's PC
   394  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   395  	LEAQ	16(SP), AX	// f's caller's SP
   396  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   397  	get_tls(CX)
   398  	MOVQ	g(CX), SI
   399  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   400  
   401  	// Set g->sched to context in f.
   402  	MOVQ	0(SP), AX // f's PC
   403  	MOVQ	AX, (g_sched+gobuf_pc)(SI)
   404  	MOVQ	SI, (g_sched+gobuf_g)(SI)
   405  	LEAQ	8(SP), AX // f's SP
   406  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   407  	MOVQ	BP, (g_sched+gobuf_bp)(SI)
   408  	// newstack will fill gobuf.ctxt.
   409  
   410  	// Call newstack on m->g0's stack.
   411  	MOVQ	m_g0(BX), BX
   412  	MOVQ	BX, g(CX)
   413  	MOVQ	(g_sched+gobuf_sp)(BX), SP
   414  	PUSHQ	DX	// ctxt argument
   415  	CALL	runtime·newstack(SB)
   416  	MOVQ	$0, 0x1003	// crash if newstack returns
   417  	POPQ	DX	// keep balance check happy
   418  	RET
   419  
   420  // morestack but not preserving ctxt.
   421  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   422  	MOVL	$0, DX	// DX carries ctxt into morestack; none to preserve here
   423  	JMP	runtime·morestack(SB)
   424  
   425  // reflectcall: call a function with the given argument list
   426  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   427  // we don't have variable-sized frames, so we use a small number
   428  // of constant-sized-frame functions to encode a few bits of size in the pc.
   429  // Caution: ugly multiline assembly macros in your future!
   430  
   431  #define DISPATCH(NAME,MAXSIZE)		\
   432  	CMPQ	CX, $MAXSIZE;		\
   433  	JA	3(PC);			\
   434  	MOVQ	$NAME(SB), AX;		\
   435  	JMP	AX
   436  // Note: can't just "JMP NAME(SB)" - a direct jump gives bad inlining
   437  // results, so tail-call the sized stub indirectly through AX instead.
   437  
   438  TEXT reflect·call(SB), NOSPLIT, $0-0
   439  	JMP	·reflectcall(SB)	// alias: forward to the runtime implementation
   440  
   441  TEXT ·reflectcall(SB), NOSPLIT, $0-32
   442  	MOVLQZX argsize+24(FP), CX	// CX = argsize, zero-extended; DISPATCH keys on it
   443  	DISPATCH(runtime·call32, 32)
   444  	DISPATCH(runtime·call64, 64)
   445  	DISPATCH(runtime·call128, 128)
   446  	DISPATCH(runtime·call256, 256)
   447  	DISPATCH(runtime·call512, 512)
   448  	DISPATCH(runtime·call1024, 1024)
   449  	DISPATCH(runtime·call2048, 2048)
   450  	DISPATCH(runtime·call4096, 4096)
   451  	DISPATCH(runtime·call8192, 8192)
   452  	DISPATCH(runtime·call16384, 16384)
   453  	DISPATCH(runtime·call32768, 32768)
   454  	DISPATCH(runtime·call65536, 65536)
   455  	DISPATCH(runtime·call131072, 131072)
   456  	DISPATCH(runtime·call262144, 262144)
   457  	DISPATCH(runtime·call524288, 524288)
   458  	DISPATCH(runtime·call1048576, 1048576)
   459  	DISPATCH(runtime·call2097152, 2097152)
   460  	DISPATCH(runtime·call4194304, 4194304)
   461  	DISPATCH(runtime·call8388608, 8388608)
   462  	DISPATCH(runtime·call16777216, 16777216)
   463  	DISPATCH(runtime·call33554432, 33554432)
   464  	DISPATCH(runtime·call67108864, 67108864)
   465  	DISPATCH(runtime·call134217728, 134217728)
   466  	DISPATCH(runtime·call268435456, 268435456)
   467  	DISPATCH(runtime·call536870912, 536870912)
   468  	DISPATCH(runtime·call1073741824, 1073741824)
   469  	MOVQ	$runtime·badreflectcall(SB), AX	// argsize exceeds the largest stub
   470  	JMP	AX
   471  
   472  #define CALLFN(NAME,MAXSIZE)			\
   473  TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   474  	NO_LOCAL_POINTERS;			\
   475  	/* copy arguments to stack */		\
   476  	MOVQ	argptr+16(FP), SI;		\
   477  	MOVLQZX argsize+24(FP), CX;		\
   478  	MOVQ	SP, DI;				\
   479  	REP;MOVSB;				\
   480  	/* call function; DX = FuncVal per closure convention */	\
   481  	MOVQ	f+8(FP), DX;			\
   482  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   483  	CALL	(DX);				\
   484  	/* copy return values back via callRet */	\
   485  	MOVQ	argtype+0(FP), DX;		\
   486  	MOVQ	argptr+16(FP), DI;		\
   487  	MOVLQZX	argsize+24(FP), CX;		\
   488  	MOVLQZX	retoffset+28(FP), BX;		\
   489  	MOVQ	SP, SI;				\
   490  	ADDQ	BX, DI;				\
   491  	ADDQ	BX, SI;				\
   492  	SUBQ	BX, CX;				\
   493  	CALL	callRet<>(SB);			\
   494  	RET
   495  
   496  // callRet copies return values back at the end of call*. This is a
   497  // separate function so it can allocate stack space for the arguments
   498  // to reflectcallmove. It does not follow the Go ABI; it expects its
   499  // arguments in registers.
   500  TEXT callRet<>(SB), NOSPLIT, $32-0
   501  	NO_LOCAL_POINTERS
   502  	MOVQ	DX, 0(SP)	// arg 0: argtype
   503  	MOVQ	DI, 8(SP)	// arg 1: destination = argptr+retoffset
   504  	MOVQ	SI, 16(SP)	// arg 2: source = results on the call* stack
   505  	MOVQ	CX, 24(SP)	// arg 3: byte count = argsize-retoffset
   506  	CALL	runtime·reflectcallmove(SB)
   507  	RET
   508  
   509  CALLFN(·call32, 32)	// instantiate one call stub per power-of-two frame size
   510  CALLFN(·call64, 64)
   511  CALLFN(·call128, 128)
   512  CALLFN(·call256, 256)
   513  CALLFN(·call512, 512)
   514  CALLFN(·call1024, 1024)
   515  CALLFN(·call2048, 2048)
   516  CALLFN(·call4096, 4096)
   517  CALLFN(·call8192, 8192)
   518  CALLFN(·call16384, 16384)
   519  CALLFN(·call32768, 32768)
   520  CALLFN(·call65536, 65536)
   521  CALLFN(·call131072, 131072)
   522  CALLFN(·call262144, 262144)
   523  CALLFN(·call524288, 524288)
   524  CALLFN(·call1048576, 1048576)
   525  CALLFN(·call2097152, 2097152)
   526  CALLFN(·call4194304, 4194304)
   527  CALLFN(·call8388608, 8388608)
   528  CALLFN(·call16777216, 16777216)
   529  CALLFN(·call33554432, 33554432)
   530  CALLFN(·call67108864, 67108864)
   531  CALLFN(·call134217728, 134217728)
   532  CALLFN(·call268435456, 268435456)
   533  CALLFN(·call536870912, 536870912)
   534  CALLFN(·call1073741824, 1073741824)
   535  
   536  TEXT runtime·procyield(SB),NOSPLIT,$0-0
   537  	MOVL	cycles+0(FP), AX	// AX = spin count
   538  again:
   539  	PAUSE		// spin-wait hint to the CPU
   540  	SUBL	$1, AX
   541  	JNZ	again
   542  	RET
   543  
   544  
   545  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   546  	// Stores are already ordered on x86, so this is just a
   547  	// compile barrier.
   548  	RET	// no-op at run time
   549  
   550  // void jmpdefer(fn, sp);
   551  // called from deferreturn.
   552  // 1. pop the caller
   553  // 2. sub 5 bytes from the callers return
   554  // 3. jmp to the argument
   555  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   556  	MOVQ	fv+0(FP), DX	// fn
   557  	MOVQ	argp+8(FP), BX	// caller sp
   558  	LEAQ	-8(BX), SP	// caller sp after CALL
   559  	MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
   560  	SUBQ	$5, (SP)	// return to CALL again (5 = encoded size of the CALL)
   561  	MOVQ	0(DX), BX	// code pointer from the FuncVal
   562  	JMP	BX	// but first run the deferred function
   563  
   564  // Save state of caller into g->sched. Smashes R8, R9.
   565  TEXT gosave<>(SB),NOSPLIT,$0
   566  	get_tls(R8)
   567  	MOVQ	g(R8), R8	// R8 = current g
   568  	MOVQ	0(SP), R9	// caller's PC
   569  	MOVQ	R9, (g_sched+gobuf_pc)(R8)
   570  	LEAQ	8(SP), R9	// caller's SP
   571  	MOVQ	R9, (g_sched+gobuf_sp)(R8)
   572  	MOVQ	$0, (g_sched+gobuf_ret)(R8)
   573  	MOVQ	BP, (g_sched+gobuf_bp)(R8)
   574  	// Assert ctxt is zero. See func save.
   575  	MOVQ	(g_sched+gobuf_ctxt)(R8), R9
   576  	TESTQ	R9, R9
   577  	JZ	2(PC)		// skip the abort when ctxt == 0
   578  	CALL	runtime·badctxt(SB)
   579  	RET
   580  
   581  // func asmcgocall(fn, arg unsafe.Pointer) int32
   582  // Call fn(arg) on the scheduler stack,
   583  // aligned appropriately for the gcc ABI.
   584  // See cgocall.go for more details.
   585  TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   586  	MOVQ	fn+0(FP), AX
   587  	MOVQ	arg+8(FP), BX
   588  
   589  	MOVQ	SP, DX	// remember entry SP; used below to compute stack depth
   590  
   591  	// Figure out if we need to switch to m->g0 stack.
   592  	// We get called to create new OS threads too, and those
   593  	// come in on the m->g0 stack already.
   594  	get_tls(CX)
   595  	MOVQ	g(CX), R8
   596  	CMPQ	R8, $0
   597  	JEQ	nosave
   598  	MOVQ	g_m(R8), R8
   599  	MOVQ	m_g0(R8), SI
   600  	MOVQ	g(CX), DI
   601  	CMPQ	SI, DI	// already on g0?
   602  	JEQ	nosave
   603  	MOVQ	m_gsignal(R8), SI
   604  	CMPQ	SI, DI	// on the signal stack?
   605  	JEQ	nosave
   606  	
   607  	// Switch to system stack.
   608  	MOVQ	m_g0(R8), SI
   609  	CALL	gosave<>(SB)	// save this g's sched state for the return trip
   610  	MOVQ	SI, g(CX)
   611  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   612  
   613  	// Now on a scheduling stack (a pthread-created stack).
   614  	// Make sure we have enough room for 4 stack-backed fast-call
   615  	// registers as per windows amd64 calling convention.
   616  	SUBQ	$64, SP
   617  	ANDQ	$~15, SP	// alignment for gcc ABI
   618  	MOVQ	DI, 48(SP)	// save g
   619  	MOVQ	(g_stack+stack_hi)(DI), DI
   620  	SUBQ	DX, DI
   621  	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   622  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   623  	MOVQ	BX, CX		// CX = first argument in Win64
   624  	CALL	AX
   625  
   626  	// Restore registers, g, stack pointer.
   627  	get_tls(CX)
   628  	MOVQ	48(SP), DI	// reload saved g
   629  	MOVQ	(g_stack+stack_hi)(DI), SI
   630  	SUBQ	40(SP), SI	// SP = stack.hi - saved depth (valid even if stack moved)
   631  	MOVQ	DI, g(CX)
   632  	MOVQ	SI, SP
   633  
   634  	MOVL	AX, ret+16(FP)
   635  	RET
   636  
   637  nosave:
   638  	// Running on a system stack, perhaps even without a g.
   639  	// Having no g can happen during thread creation or thread teardown
   640  	// (see needm/dropm on Solaris, for example).
   641  	// This code is like the above sequence but without saving/restoring g
   642  	// and without worrying about the stack moving out from under us
   643  	// (because we're on a system stack, not a goroutine stack).
   644  	// The above code could be used directly if already on a system stack,
   645  	// but then the only path through this code would be a rare case on Solaris.
   646  	// Using this code for all "already on system stack" calls exercises it more,
   647  	// which should help keep it correct.
   648  	SUBQ	$64, SP
   649  	ANDQ	$~15, SP
   650  	MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
   651  	MOVQ	DX, 40(SP)	// save original stack pointer
   652  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   653  	MOVQ	BX, CX		// CX = first argument in Win64
   654  	CALL	AX
   655  	MOVQ	40(SP), SI	// restore original stack pointer
   656  	MOVQ	SI, SP
   657  	MOVL	AX, ret+16(FP)
   658  	RET
   659  
   660  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
   661  // Turn the fn into a Go func (by taking its address) and call
   662  // cgocallback_gofunc.
   663  TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
   664  	LEAQ	fn+0(FP), AX	// &fn acts as the FuncVal for cgocallback_gofunc
   665  	MOVQ	AX, 0(SP)
   666  	MOVQ	frame+8(FP), AX
   667  	MOVQ	AX, 8(SP)
   668  	MOVQ	framesize+16(FP), AX
   669  	MOVQ	AX, 16(SP)
   670  	MOVQ	ctxt+24(FP), AX
   671  	MOVQ	AX, 24(SP)
   672  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   673  	CALL	AX
   674  	RET
   675  
   676  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
   677  // See cgocall.go for more details.
   678  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
   679  	NO_LOCAL_POINTERS
   680  
   681  	// If g is nil, Go did not create the current thread.
   682  	// Call needm to obtain one m for temporary use.
   683  	// In this case, we're running on the thread stack, so there's
   684  	// lots of space, but the linker doesn't know. Hide the call from
   685  	// the linker analysis by using an indirect call through AX.
   686  	get_tls(CX)
   687  #ifdef GOOS_windows
   688  	MOVL	$0, BX
   689  	CMPQ	CX, $0
   690  	JEQ	2(PC)	// TLS base may be nil on Windows; skip the g load
   691  #endif
   692  	MOVQ	g(CX), BX
   693  	CMPQ	BX, $0
   694  	JEQ	needm
   695  	MOVQ	g_m(BX), BX
   696  	MOVQ	BX, R8 // holds oldm until end of function
   697  	JMP	havem
   698  needm:
   699  	MOVQ	$0, 0(SP)	// oldm is nil: this thread was not created by Go
   700  	MOVQ	$runtime·needm(SB), AX
   701  	CALL	AX
   702  	MOVQ	0(SP), R8	// R8 = oldm (nil on this path)
   703  	get_tls(CX)
   704  	MOVQ	g(CX), BX
   705  	MOVQ	g_m(BX), BX
   706  	
   707  	// Set m->sched.sp = SP, so that if a panic happens
   708  	// during the function we are about to execute, it will
   709  	// have a valid SP to run on the g0 stack.
   710  	// The next few lines (after the havem label)
   711  	// will save this SP onto the stack and then write
   712  	// the same SP back to m->sched.sp. That seems redundant,
   713  	// but if an unrecovered panic happens, unwindm will
   714  	// restore the g->sched.sp from the stack location
   715  	// and then systemstack will try to use it. If we don't set it here,
   716  	// that restored SP will be uninitialized (typically 0) and
   717  	// will not be usable.
   718  	MOVQ	m_g0(BX), SI
   719  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   720  
   721  havem:
   722  	// Now there's a valid m, and we're running on its m->g0.
   723  	// Save current m->g0->sched.sp on stack and then set it to SP.
   724  	// Save current sp in m->g0->sched.sp in preparation for
   725  	// switch back to m->curg stack.
   726  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   727  	MOVQ	m_g0(BX), SI
   728  	MOVQ	(g_sched+gobuf_sp)(SI), AX
   729  	MOVQ	AX, 0(SP)
   730  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   731  
   732  	// Switch to m->curg stack and call runtime.cgocallbackg.
   733  	// Because we are taking over the execution of m->curg
   734  	// but *not* resuming what had been running, we need to
   735  	// save that information (m->curg->sched) so we can restore it.
   736  	// We can restore m->curg->sched.sp easily, because calling
   737  	// runtime.cgocallbackg leaves SP unchanged upon return.
   738  	// To save m->curg->sched.pc, we push it onto the stack.
   739  	// This has the added benefit that it looks to the traceback
   740  	// routine like cgocallbackg is going to return to that
   741  	// PC (because the frame we allocate below has the same
   742  	// size as cgocallback_gofunc's frame declared above)
   743  	// so that the traceback will seamlessly trace back into
   744  	// the earlier calls.
   745  	//
   746  	// In the new goroutine, 8(SP) holds the saved R8.
   747  	MOVQ	m_curg(BX), SI
   748  	MOVQ	SI, g(CX)
   749  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   750  	MOVQ	(g_sched+gobuf_pc)(SI), BX
   751  	MOVQ	BX, -8(DI)	// stash curg's resume PC just below its saved SP
   752  	// Compute the size of the frame, including return PC and, if
   753  	// GOEXPERIMENT=framepointer, the saved base pointer
   754  	MOVQ	ctxt+24(FP), BX
   755  	LEAQ	fv+0(FP), AX	// frame size = &fv (top of frame) - SP
   756  	SUBQ	SP, AX
   757  	SUBQ	AX, DI		// carve an equal-sized frame on curg's stack
   758  	MOVQ	DI, SP
   759  
   760  	MOVQ	R8, 8(SP)	// preserve oldm across cgocallbackg
   761  	MOVQ	BX, 0(SP)	// argument: ctxt
   762  	CALL	runtime·cgocallbackg(SB)
   763  	MOVQ	8(SP), R8	// reload oldm
   764  
   765  	// Compute the size of the frame again. FP and SP have
   766  	// completely different values here than they did above,
   767  	// but only their difference matters.
   768  	LEAQ	fv+0(FP), AX
   769  	SUBQ	SP, AX
   770  
   771  	// Restore g->sched (== m->curg->sched) from saved values.
   772  	get_tls(CX)
   773  	MOVQ	g(CX), SI
   774  	MOVQ	SP, DI
   775  	ADDQ	AX, DI
   776  	MOVQ	-8(DI), BX	// recover the resume PC stashed above
   777  	MOVQ	BX, (g_sched+gobuf_pc)(SI)
   778  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   779  
   780  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   781  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   782  	// so we do not have to restore it.)
   783  	MOVQ	g(CX), BX
   784  	MOVQ	g_m(BX), BX
   785  	MOVQ	m_g0(BX), SI
   786  	MOVQ	SI, g(CX)
   787  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   788  	MOVQ	0(SP), AX	// old g0 sched.sp, saved at 0(SP) above
   789  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   790  	
   791  	// If the m on entry was nil, we called needm above to borrow an m
   792  	// for the duration of the call. Since the call is over, return it with dropm.
   793  	CMPQ	R8, $0
   794  	JNE 3(PC)	// had an m on entry (oldm != nil): skip dropm
   795  	MOVQ	$runtime·dropm(SB), AX
   796  	CALL	AX
   797  
   798  	// Done!
   799  	RET
   800  
   801  // void setg(G*); set g. for use by needm.
   802  TEXT runtime·setg(SB), NOSPLIT, $0-8
   803  	MOVQ	gg+0(FP), BX
   804  #ifdef GOOS_windows
   805  	CMPQ	BX, $0
   806  	JNE	settls
   807  	MOVQ	$0, 0x28(GS)	// g == nil: clear the Windows TLS slot
   808  	RET
   809  settls:
   810  	MOVQ	g_m(BX), AX
   811  	LEAQ	m_tls(AX), AX
   812  	MOVQ	AX, 0x28(GS)	// point the Windows TLS slot at g's m.tls
   813  #endif
   814  	get_tls(CX)
   815  	MOVQ	BX, g(CX)
   816  	RET
   817  
   818  // void setg_gcc(G*); set g called from gcc.
   819  TEXT setg_gcc<>(SB),NOSPLIT,$0
   820  	get_tls(AX)
   821  	MOVQ	DI, g(AX)	// DI = g, first argument in the gcc (System V) ABI
   822  	RET
   823  
   824  // check that SP is in range [g->stack.lo, g->stack.hi)
   825  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   826  	get_tls(CX)
   827  	MOVQ	g(CX), AX
   828  	CMPQ	(g_stack+stack_hi)(AX), SP
   829  	JHI	2(PC)
   830  	INT	$3	// trap: SP at or above stack.hi
   831  	CMPQ	SP, (g_stack+stack_lo)(AX)
   832  	JHI	2(PC)
   833  	INT	$3	// trap: SP not above stack.lo
   834  	RET
   835  
   836  TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
   837  	MOVQ	argp+0(FP),AX		// addr of first arg
   838  	MOVQ	-8(AX),AX		// get calling pc (return address sits just below argp)
   839  	MOVQ	AX, ret+8(FP)
   840  	RET
   841  
   842  // func cputicks() int64
   843  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   844  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   845  	JNE	mfence
   846  	LFENCE		// Intel: LFENCE is enough to serialize RDTSC (see rt0_go)
   847  	JMP	done
   848  mfence:
   849  	MFENCE		// other vendors: use the stronger MFENCE
   850  done:
   851  	RDTSC		// counter in EDX:EAX
   852  	SHLQ	$32, DX
   853  	ADDQ	DX, AX	// AX = (DX << 32) | AX, the full 64-bit tick count
   854  	MOVQ	AX, ret+0(FP)
   855  	RET
   856  
   857  // hash function using AES hardware instructions
   858  TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   859  	MOVQ	p+0(FP), AX	// ptr to data
   860  	MOVQ	s+16(FP), CX	// size
   861  	LEAQ	ret+24(FP), DX	// address of the result slot
   862  	JMP	runtime·aeshashbody(SB)
   863  
   864  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
   865  	MOVQ	p+0(FP), AX	// ptr to string struct
   866  	MOVQ	8(AX), CX	// length of string
   867  	MOVQ	(AX), AX	// string data
   868  	LEAQ	ret+16(FP), DX	// address of the result slot
   869  	JMP	runtime·aeshashbody(SB)
   870  
   871  // AX: data
   872  // CX: length
   873  // DX: address to put return value
   874  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   875  	// Fill an SSE register with our seeds.
   876  	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
   877  	PINSRW	$4, CX, X0			// 16 bits of length
   878  	PSHUFHW $0, X0, X0			// repeat length 4 times total
   879  	MOVO	X0, X1				// save unscrambled seed
   880  	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
   881  	AESENC	X0, X0				// scramble seed
   882  
   883  	CMPQ	CX, $16
   884  	JB	aes0to15
   885  	JE	aes16
   886  	CMPQ	CX, $32
   887  	JBE	aes17to32
   888  	CMPQ	CX, $64
   889  	JBE	aes33to64
   890  	CMPQ	CX, $128
   891  	JBE	aes65to128
   892  	JMP	aes129plus
   893  
   894  aes0to15:
   895  	TESTQ	CX, CX
   896  	JE	aes0
   897  
   898  	ADDQ	$16, AX
   899  	TESTW	$0xff0, AX
   900  	JE	endofpage
   901  
   902  	// 16 bytes loaded at this address won't cross
   903  	// a page boundary, so we can load it directly.
   904  	MOVOU	-16(AX), X1
   905  	ADDQ	CX, CX
   906  	MOVQ	$masks<>(SB), AX
   907  	PAND	(AX)(CX*8), X1
   908  final1:
   909  	PXOR	X0, X1	// xor data with seed
   910  	AESENC	X1, X1	// scramble combo 3 times
   911  	AESENC	X1, X1
   912  	AESENC	X1, X1
   913  	MOVQ	X1, (DX)
   914  	RET
   915  
   916  endofpage:
   917  	// address ends in 1111xxxx. Might be up against
   918  	// a page boundary, so load ending at last byte.
   919  	// Then shift bytes down using pshufb.
   920  	MOVOU	-32(AX)(CX*1), X1
   921  	ADDQ	CX, CX
   922  	MOVQ	$shifts<>(SB), AX
   923  	PSHUFB	(AX)(CX*8), X1
   924  	JMP	final1
   925  
   926  aes0:
   927  	// Return scrambled input seed
   928  	AESENC	X0, X0
   929  	MOVQ	X0, (DX)
   930  	RET
   931  
   932  aes16:
   933  	MOVOU	(AX), X1
   934  	JMP	final1
   935  
   936  aes17to32:
   937  	// make second starting seed
   938  	PXOR	runtime·aeskeysched+16(SB), X1
   939  	AESENC	X1, X1
   940  	
   941  	// load data to be hashed
   942  	MOVOU	(AX), X2
   943  	MOVOU	-16(AX)(CX*1), X3
   944  
   945  	// xor with seed
   946  	PXOR	X0, X2
   947  	PXOR	X1, X3
   948  
   949  	// scramble 3 times
   950  	AESENC	X2, X2
   951  	AESENC	X3, X3
   952  	AESENC	X2, X2
   953  	AESENC	X3, X3
   954  	AESENC	X2, X2
   955  	AESENC	X3, X3
   956  
   957  	// combine results
   958  	PXOR	X3, X2
   959  	MOVQ	X2, (DX)
   960  	RET
   961  
   962  aes33to64:
   963  	// make 3 more starting seeds
   964  	MOVO	X1, X2
   965  	MOVO	X1, X3
   966  	PXOR	runtime·aeskeysched+16(SB), X1
   967  	PXOR	runtime·aeskeysched+32(SB), X2
   968  	PXOR	runtime·aeskeysched+48(SB), X3
   969  	AESENC	X1, X1
   970  	AESENC	X2, X2
   971  	AESENC	X3, X3
   972  	
   973  	MOVOU	(AX), X4
   974  	MOVOU	16(AX), X5
   975  	MOVOU	-32(AX)(CX*1), X6
   976  	MOVOU	-16(AX)(CX*1), X7
   977  
   978  	PXOR	X0, X4
   979  	PXOR	X1, X5
   980  	PXOR	X2, X6
   981  	PXOR	X3, X7
   982  	
   983  	AESENC	X4, X4
   984  	AESENC	X5, X5
   985  	AESENC	X6, X6
   986  	AESENC	X7, X7
   987  	
   988  	AESENC	X4, X4
   989  	AESENC	X5, X5
   990  	AESENC	X6, X6
   991  	AESENC	X7, X7
   992  	
   993  	AESENC	X4, X4
   994  	AESENC	X5, X5
   995  	AESENC	X6, X6
   996  	AESENC	X7, X7
   997  
   998  	PXOR	X6, X4
   999  	PXOR	X7, X5
  1000  	PXOR	X5, X4
  1001  	MOVQ	X4, (DX)
  1002  	RET
  1003  
  1004  aes65to128:
  1005  	// make 7 more starting seeds
  1006  	MOVO	X1, X2
  1007  	MOVO	X1, X3
  1008  	MOVO	X1, X4
  1009  	MOVO	X1, X5
  1010  	MOVO	X1, X6
  1011  	MOVO	X1, X7
  1012  	PXOR	runtime·aeskeysched+16(SB), X1
  1013  	PXOR	runtime·aeskeysched+32(SB), X2
  1014  	PXOR	runtime·aeskeysched+48(SB), X3
  1015  	PXOR	runtime·aeskeysched+64(SB), X4
  1016  	PXOR	runtime·aeskeysched+80(SB), X5
  1017  	PXOR	runtime·aeskeysched+96(SB), X6
  1018  	PXOR	runtime·aeskeysched+112(SB), X7
  1019  	AESENC	X1, X1
  1020  	AESENC	X2, X2
  1021  	AESENC	X3, X3
  1022  	AESENC	X4, X4
  1023  	AESENC	X5, X5
  1024  	AESENC	X6, X6
  1025  	AESENC	X7, X7
  1026  
  1027  	// load data
  1028  	MOVOU	(AX), X8
  1029  	MOVOU	16(AX), X9
  1030  	MOVOU	32(AX), X10
  1031  	MOVOU	48(AX), X11
  1032  	MOVOU	-64(AX)(CX*1), X12
  1033  	MOVOU	-48(AX)(CX*1), X13
  1034  	MOVOU	-32(AX)(CX*1), X14
  1035  	MOVOU	-16(AX)(CX*1), X15
  1036  
  1037  	// xor with seed
  1038  	PXOR	X0, X8
  1039  	PXOR	X1, X9
  1040  	PXOR	X2, X10
  1041  	PXOR	X3, X11
  1042  	PXOR	X4, X12
  1043  	PXOR	X5, X13
  1044  	PXOR	X6, X14
  1045  	PXOR	X7, X15
  1046  
  1047  	// scramble 3 times
  1048  	AESENC	X8, X8
  1049  	AESENC	X9, X9
  1050  	AESENC	X10, X10
  1051  	AESENC	X11, X11
  1052  	AESENC	X12, X12
  1053  	AESENC	X13, X13
  1054  	AESENC	X14, X14
  1055  	AESENC	X15, X15
  1056  
  1057  	AESENC	X8, X8
  1058  	AESENC	X9, X9
  1059  	AESENC	X10, X10
  1060  	AESENC	X11, X11
  1061  	AESENC	X12, X12
  1062  	AESENC	X13, X13
  1063  	AESENC	X14, X14
  1064  	AESENC	X15, X15
  1065  
  1066  	AESENC	X8, X8
  1067  	AESENC	X9, X9
  1068  	AESENC	X10, X10
  1069  	AESENC	X11, X11
  1070  	AESENC	X12, X12
  1071  	AESENC	X13, X13
  1072  	AESENC	X14, X14
  1073  	AESENC	X15, X15
  1074  
  1075  	// combine results
  1076  	PXOR	X12, X8
  1077  	PXOR	X13, X9
  1078  	PXOR	X14, X10
  1079  	PXOR	X15, X11
  1080  	PXOR	X10, X8
  1081  	PXOR	X11, X9
  1082  	PXOR	X9, X8
  1083  	MOVQ	X8, (DX)
  1084  	RET
  1085  
  1086  aes129plus:
  1087  	// make 7 more starting seeds
  1088  	MOVO	X1, X2
  1089  	MOVO	X1, X3
  1090  	MOVO	X1, X4
  1091  	MOVO	X1, X5
  1092  	MOVO	X1, X6
  1093  	MOVO	X1, X7
  1094  	PXOR	runtime·aeskeysched+16(SB), X1
  1095  	PXOR	runtime·aeskeysched+32(SB), X2
  1096  	PXOR	runtime·aeskeysched+48(SB), X3
  1097  	PXOR	runtime·aeskeysched+64(SB), X4
  1098  	PXOR	runtime·aeskeysched+80(SB), X5
  1099  	PXOR	runtime·aeskeysched+96(SB), X6
  1100  	PXOR	runtime·aeskeysched+112(SB), X7
  1101  	AESENC	X1, X1
  1102  	AESENC	X2, X2
  1103  	AESENC	X3, X3
  1104  	AESENC	X4, X4
  1105  	AESENC	X5, X5
  1106  	AESENC	X6, X6
  1107  	AESENC	X7, X7
  1108  	
  1109  	// start with last (possibly overlapping) block
  1110  	MOVOU	-128(AX)(CX*1), X8
  1111  	MOVOU	-112(AX)(CX*1), X9
  1112  	MOVOU	-96(AX)(CX*1), X10
  1113  	MOVOU	-80(AX)(CX*1), X11
  1114  	MOVOU	-64(AX)(CX*1), X12
  1115  	MOVOU	-48(AX)(CX*1), X13
  1116  	MOVOU	-32(AX)(CX*1), X14
  1117  	MOVOU	-16(AX)(CX*1), X15
  1118  
  1119  	// xor in seed
  1120  	PXOR	X0, X8
  1121  	PXOR	X1, X9
  1122  	PXOR	X2, X10
  1123  	PXOR	X3, X11
  1124  	PXOR	X4, X12
  1125  	PXOR	X5, X13
  1126  	PXOR	X6, X14
  1127  	PXOR	X7, X15
  1128  	
  1129  	// compute number of remaining 128-byte blocks
  1130  	DECQ	CX
  1131  	SHRQ	$7, CX
  1132  	
  1133  aesloop:
  1134  	// scramble state
  1135  	AESENC	X8, X8
  1136  	AESENC	X9, X9
  1137  	AESENC	X10, X10
  1138  	AESENC	X11, X11
  1139  	AESENC	X12, X12
  1140  	AESENC	X13, X13
  1141  	AESENC	X14, X14
  1142  	AESENC	X15, X15
  1143  
  1144  	// scramble state, xor in a block
  1145  	MOVOU	(AX), X0
  1146  	MOVOU	16(AX), X1
  1147  	MOVOU	32(AX), X2
  1148  	MOVOU	48(AX), X3
  1149  	AESENC	X0, X8
  1150  	AESENC	X1, X9
  1151  	AESENC	X2, X10
  1152  	AESENC	X3, X11
  1153  	MOVOU	64(AX), X4
  1154  	MOVOU	80(AX), X5
  1155  	MOVOU	96(AX), X6
  1156  	MOVOU	112(AX), X7
  1157  	AESENC	X4, X12
  1158  	AESENC	X5, X13
  1159  	AESENC	X6, X14
  1160  	AESENC	X7, X15
  1161  
  1162  	ADDQ	$128, AX
  1163  	DECQ	CX
  1164  	JNE	aesloop
  1165  
  1166  	// 3 more scrambles to finish
  1167  	AESENC	X8, X8
  1168  	AESENC	X9, X9
  1169  	AESENC	X10, X10
  1170  	AESENC	X11, X11
  1171  	AESENC	X12, X12
  1172  	AESENC	X13, X13
  1173  	AESENC	X14, X14
  1174  	AESENC	X15, X15
  1175  	AESENC	X8, X8
  1176  	AESENC	X9, X9
  1177  	AESENC	X10, X10
  1178  	AESENC	X11, X11
  1179  	AESENC	X12, X12
  1180  	AESENC	X13, X13
  1181  	AESENC	X14, X14
  1182  	AESENC	X15, X15
  1183  	AESENC	X8, X8
  1184  	AESENC	X9, X9
  1185  	AESENC	X10, X10
  1186  	AESENC	X11, X11
  1187  	AESENC	X12, X12
  1188  	AESENC	X13, X13
  1189  	AESENC	X14, X14
  1190  	AESENC	X15, X15
  1191  
  1192  	PXOR	X12, X8
  1193  	PXOR	X13, X9
  1194  	PXOR	X14, X10
  1195  	PXOR	X15, X11
  1196  	PXOR	X10, X8
  1197  	PXOR	X11, X9
  1198  	PXOR	X9, X8
  1199  	MOVQ	X8, (DX)
  1200  	RET
  1201  	
// func aeshash32(p unsafe.Pointer, h uintptr) uintptr
// AES-based hash of a 4-byte value. Inserts the 32-bit data into the
// seed register, then runs three AESENC rounds against the round keys
// in runtime·aeskeysched to mix the bits.
TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET
  1211  
// func aeshash64(p unsafe.Pointer, h uintptr) uintptr
// AES-based hash of an 8-byte value. Same scheme as aeshash32, but the
// data is inserted as a 64-bit lane.
TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET
  1221  
// simple mask to get rid of data in the high part of the register.
// masks<> is a table of 16 entries of 16 bytes each: entry i (at byte
// offset i*16) keeps the low i bytes and zeroes the rest. aeshashbody
// indexes it with (length*16) via a doubled CX to mask partial loads.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256
  1256  
// func checkASM() bool
// Reports whether the assembly-level invariants hold.
TEXT ·checkASM(SB),NOSPLIT,$0-1
	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
	MOVQ	$masks<>(SB), AX
	MOVQ	$shifts<>(SB), BX
	ORQ	BX, AX		// any low bit set in either address shows up here
	TESTQ	$15, AX		// low 4 bits must be zero for 16-byte alignment
	SETEQ	ret+0(FP)
	RET
  1265  
// these are arguments to pshufb. They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
// 16 entries of 16 bytes each; a 0xff selector byte makes PSHUFB
// produce a zero byte in that position.
DATA shifts<>+0x00(SB)/8, $0x0000000000000000
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256
  1302  
// memequal(a, b unsafe.Pointer, size uintptr) bool
// Reports whether the size bytes at a and b are equal.
TEXT runtime·memequal(SB),NOSPLIT,$0-25
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq		// identical pointers compare equal without reading memory
	MOVQ	size+16(FP), BX
	LEAQ	ret+24(FP), AX	// memeqbody writes the result byte through AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+24(FP)
	RET
  1315  
// memequal_varlen(a, b unsafe.Pointer) bool
// Like memequal, but the size comes from the closure context in DX.
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq		// identical pointers compare equal without reading memory
	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
	LEAQ	ret+16(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+16(FP)
	RET
  1328  
// memeqbody: shared byte-equality kernel.
// a in SI
// b in DI
// count in BX
// address of result byte in AX
// Compares 64 bytes at a time (AVX2 or SSE), then 8 at a time, then
// handles the sub-8-byte tail with overlapping/masked loads.
TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small
	CMPQ	BX, $64
	JB	bigloop
	CMPB    runtime·support_avx2(SB), $1
	JE	hugeloop_avx2

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0		// AND the per-byte equality masks together
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX		// DX = 16-bit mask, one bit per byte of X0
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff	// all 16 bits set <=> all 64 bytes equal
	JEQ	hugeloop
	MOVB	$0, (AX)
	RET

	// 64 bytes at a time using ymm registers
hugeloop_avx2:
	CMPQ	BX, $64
	JB	bigloop_avx2
	VMOVDQU	(SI), Y0
	VMOVDQU	(DI), Y1
	VMOVDQU	32(SI), Y2
	VMOVDQU	32(DI), Y3
	VPCMPEQB	Y1, Y0, Y4
	VPCMPEQB	Y2, Y3, Y5
	VPAND	Y4, Y5, Y6
	VPMOVMSKB Y6, DX	// DX = 32-bit mask, one bit per byte
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffffffff
	JEQ	hugeloop_avx2
	VZEROUPPER		// avoid AVX->SSE transition penalty before returning
	MOVB	$0, (AX)
	RET

bigloop_avx2:
	VZEROUPPER		// done with ymm registers; clear upper halves

	// 8 bytes at a time using 64-bit register
bigloop:
	CMPQ	BX, $8
	JBE	leftover
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	bigloop
	MOVB	$0, (AX)
	RET

	// remaining 0-8 bytes
leftover:
	// Overlapping 8-byte loads ending at the last byte; safe because
	// BX >= 1 here and the buffers are at least 8 bytes long.
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	(AX)
	RET

small:
	CMPQ	BX, $0
	JEQ	equal		// zero-length buffers are equal (ZF set by CMPQ)

	LEAQ	0(BX*8), CX	// CX = bits of data we actually want
	NEGQ	CX		// CX mod 64 = 64 - 8*BX, the shift to discard extra bytes

	CMPB	SI, $0xf8
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI		// shift count taken from CL
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
di_finish:

	SUBQ	SI, DI		// zero iff the loaded words match
	SHLQ	CX, DI		// shift out garbage high bytes; ZF set iff wanted bytes equal
equal:
	SETEQ	(AX)
	RET
  1449  
// func cmpstring(s1, s2 string) int
// Loads the string headers and tail-calls the shared compare kernel.
TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
	MOVQ	s1_base+0(FP), SI
	MOVQ	s1_len+8(FP), BX
	MOVQ	s2_base+16(FP), DI
	MOVQ	s2_len+24(FP), DX
	LEAQ	ret+32(FP), R9	// cmpbody stores -1/0/+1 through R9
	JMP	runtime·cmpbody(SB)
  1457  
// func Compare(s1, s2 []byte) int
// Slice headers are ptr/len/cap, hence s2's fields start at +24.
TEXT bytes·Compare(SB),NOSPLIT,$0-56
	MOVQ	s1+0(FP), SI
	MOVQ	s1+8(FP), BX
	MOVQ	s2+24(FP), DI
	MOVQ	s2+32(FP), DX
	LEAQ	res+48(FP), R9	// cmpbody stores -1/0/+1 through R9
	JMP	runtime·cmpbody(SB)
  1465  
// cmpbody: shared lexicographic-compare kernel for strings and byte slices.
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame		// same pointer: result depends only on lengths
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	CMPB    runtime·support_avx2(SB), $1
	JEQ     big_loop_avx2
	JMP	big_loop
loop:
	// 16 bytes at a time with SSE.
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// The diffNN labels rewind SI/DI to the 16-byte chunk within the
	// 64-byte block where the difference was found, then fall into diff16.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX	// AX = 1 if a's byte > b's byte (unsigned)
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// Overlapping loads of the last 8 bytes of the common prefix.
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
	JEQ	allsame

	// load bytes of a into high bytes of SI
	CMPB	SI, $0xf8
	JA	si_high		// load would cross a page boundary
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI		// keep only the wanted bytes, in the high end

	// load bytes of b into high bytes of DI
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

allsame:
	// Common prefix identical; compare by length.
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET

	// this works for >= 64 bytes of data.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX   // convert EQ to NE
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP diff16

	// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP loop
  1671  
// func strings.indexShortStr(s, c string) int
// Thin wrapper: sets up the register contract for runtime·indexShortStr.
TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), DI
	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
	MOVQ s_len+8(FP), DX
	MOVQ c+16(FP), BP
	MOVQ c_len+24(FP), AX
	MOVQ DI, R10		// remember start of s to compute the index later
	LEAQ ret+32(FP), R11
	JMP  runtime·indexShortStr(SB)
  1681  
// func bytes.indexShortStr(s, c []byte) int
// Same contract as strings·indexShortStr; slice headers shift c to +24.
TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
	MOVQ s+0(FP), DI
	MOVQ s_len+8(FP), DX
	MOVQ c+24(FP), BP
	MOVQ c_len+32(FP), AX
	MOVQ DI, R10		// remember start of s to compute the index later
	LEAQ ret+48(FP), R11
	JMP  runtime·indexShortStr(SB)
  1690  
// indexShortStr: substring search for short needles.
// AX: length of string, that we are searching for
// DX: length of string, in which we are searching
// DI: pointer to string, in which we are searching
// BP: pointer to string, that we are searching for
// R11: address, where to put return value
// Dispatches on needle length: scalar loads for 2-8 bytes, paired
// loads for odd sizes, SSE for 16-31, AVX2 for 32-63, and PCMPESTRI
// when SSE4.2 is available and the needle is small.
TEXT runtime·indexShortStr(SB),NOSPLIT,$0
	CMPQ AX, DX
	JA fail			// needle longer than haystack: no match
	CMPQ DX, $16
	JAE sse42
no_sse42:
	CMPQ AX, $2
	JA   _3_or_more
	// 2-byte needle: compare as a 16-bit word at every offset.
	MOVW (BP), BP
	LEAQ -1(DI)(DX*1), DX	// DX = one past the last valid start position
loop2:
	MOVW (DI), SI
	CMPW SI,BP
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop2
	JMP fail
_3_or_more:
	CMPQ AX, $3
	JA   _4_or_more
	// 3-byte needle: word at offset 0 plus overlapping word at offset 1.
	MOVW 1(BP), BX
	MOVW (BP), BP
	LEAQ -2(DI)(DX*1), DX
loop3:
	MOVW (DI), SI
	CMPW SI,BP
	JZ   partial_success3
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
partial_success3:
	MOVW 1(DI), SI
	CMPW SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
_4_or_more:
	CMPQ AX, $4
	JA   _5_or_more
	// 4-byte needle: single 32-bit compare per offset.
	MOVL (BP), BP
	LEAQ -3(DI)(DX*1), DX
loop4:
	MOVL (DI), SI
	CMPL SI,BP
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop4
	JMP fail
_5_or_more:
	CMPQ AX, $7
	JA   _8_or_more
	// 5-7 byte needle: first 4 bytes plus overlapping last 4 bytes.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVL -4(BP)(AX*1), BX
	MOVL (BP), BP
loop5to7:
	MOVL (DI), SI
	CMPL SI,BP
	JZ   partial_success5to7
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
partial_success5to7:
	MOVL -4(AX)(DI*1), SI	// last 4 bytes of the candidate window
	CMPL SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
_8_or_more:
	CMPQ AX, $8
	JA   _9_or_more
	// 8-byte needle: single 64-bit compare per offset.
	MOVQ (BP), BP
	LEAQ -7(DI)(DX*1), DX
loop8:
	MOVQ (DI), SI
	CMPQ SI,BP
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop8
	JMP fail
_9_or_more:
	CMPQ AX, $15
	JA   _16_or_more
	// 9-15 byte needle: first 8 bytes plus overlapping last 8 bytes.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVQ -8(BP)(AX*1), BX
	MOVQ (BP), BP
loop9to15:
	MOVQ (DI), SI
	CMPQ SI,BP
	JZ   partial_success9to15
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
partial_success9to15:
	MOVQ -8(AX)(DI*1), SI
	CMPQ SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
_16_or_more:
	CMPQ AX, $16
	JA   _17_or_more
	// 16-byte needle: one 128-bit compare per offset.
	MOVOU (BP), X1
	LEAQ -15(DI)(DX*1), DX
loop16:
	MOVOU (DI), X2
	PCMPEQB X1, X2
	PMOVMSKB X2, SI
	CMPQ  SI, $0xffff	// all bytes equal?
	JE   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop16
	JMP fail
_17_or_more:
	CMPQ AX, $31
	JA   _32_or_more
	// 17-31 byte needle: first 16 bytes plus overlapping last 16 bytes.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVOU -16(BP)(AX*1), X0
	MOVOU (BP), X1
loop17to31:
	MOVOU (DI), X2
	PCMPEQB X1,X2
	PMOVMSKB X2, SI
	CMPQ  SI, $0xffff
	JE   partial_success17to31
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
partial_success17to31:
	MOVOU -16(AX)(DI*1), X3
	PCMPEQB X0, X3
	PMOVMSKB X3, SI
	CMPQ  SI, $0xffff
	JE success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
// So no need to check cpuid
_32_or_more:
	CMPQ AX, $32
	JA   _33_to_63
	// 32-byte needle: one 256-bit compare per offset.
	VMOVDQU (BP), Y1
	LEAQ -31(DI)(DX*1), DX
loop32:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL  SI, $0xffffffff
	JE   success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop32
	JMP fail_avx2
_33_to_63:
	// 33-63 byte needle: first 32 bytes plus overlapping last 32 bytes.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	VMOVDQU -32(BP)(AX*1), Y0
	VMOVDQU (BP), Y1
loop33to63:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL  SI, $0xffffffff
	JE   partial_success33to63
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop33to63
	JMP fail_avx2
partial_success33to63:
	VMOVDQU -32(AX)(DI*1), Y3
	VPCMPEQB Y0, Y3, Y4
	VPMOVMSKB Y4, SI
	CMPL  SI, $0xffffffff
	JE success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop33to63
fail_avx2:
	VZEROUPPER		// leave AVX state clean before the SSE/scalar exit path
fail:
	MOVQ $-1, (R11)
	RET
success_avx2:
	VZEROUPPER
	JMP success
sse42:
	CMPB runtime·support_sse42(SB), $1
	JNE no_sse42
	CMPQ AX, $12
	// PCMPESTRI is slower than normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
	JAE _9_or_more
	LEAQ 16(BP), SI
	TESTW $0xff0, SI
	JEQ no_sse42		// needle load would cross a page boundary; fall back
	MOVOU (BP), X1
	LEAQ -15(DI)(DX*1), SI
	MOVQ $16, R9
	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
	// for equality (bits 2,3 are 11)
	// result is not masked or inverted (bits 4,5 are 00)
	// and corresponds to first matching byte (bit 6 is 0)
	PCMPESTRI $0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means partial match at the end of the string,
	// otherwise sep is at offset CX from X1 start
	CMPQ CX, R9
	JBE sse42_success
	ADDQ R9, DI
	CMPQ DI, SI
	JB loop_sse42
	PCMPESTRI $0x0c, -1(SI), X1	// final window, ending at the last byte
	CMPQ CX, R9
	JA fail
	LEAQ -1(SI), DI
sse42_success:
	ADDQ CX, DI
success:
	SUBQ R10, DI		// index = match position - start of haystack
	MOVQ DI, (R11)
	RET
  1939  
  1940  
// func IndexByte(s []byte, c byte) int
// Wrapper: sets up the register contract for runtime·indexbytebody.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+24(FP), AL
	LEAQ ret+32(FP), R8
	JMP  runtime·indexbytebody(SB)
  1947  
// func IndexByte(s string, c byte) int
// Wrapper: sets up the register contract for runtime·indexbytebody.
TEXT strings·IndexByte(SB),NOSPLIT,$0-32
	MOVQ s+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+16(FP), AL
	LEAQ ret+24(FP), R8
	JMP  runtime·indexbytebody(SB)
  1954  
// indexbytebody: find the first occurrence of a byte.
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// Returns -1 through R8 if the byte is not present. Uses AVX2 for
// lengths > 32 when available, SSE for 16-32, and masked scalar-ish
// SSE handling for lengths < 16.
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0

	CMPQ BX, $16
	JLT small

	MOVQ SI, DI		// DI = current search position

	CMPQ BX, $32
	JA avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Find first set bit, if any.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ $-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ SI, DI	// Compute offset of chunk within data.
	ADDQ DX, DI	// Add offset of byte within chunk.
	MOVQ DI, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure

	// Check if we'll load across a page boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1 // Load data
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	CMPL	DX, BX
	JAE	failure	// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX	// Shift desired bits down to bottom of register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	CMPB   runtime·support_avx2(SB), $1
	JNE sse			// AVX2 not available; use the SSE path
	MOVD AX, X0
	LEAQ -32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB  X0, Y1	// replicate the target byte across Y1
avx2_loop:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPTEST Y3, Y3		// any match bit set?
	JNZ avx2success
	ADDQ $32, DI
	CMPQ DI, R11
	JLT avx2_loop
	// Search the final (possibly overlapping) 32-byte chunk.
	MOVQ R11, DI
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPTEST Y3, Y3
	JNZ avx2success
	VZEROUPPER
	MOVQ $-1, (R8)
	RET

avx2success:
	VPMOVMSKB Y3, DX
	BSFL DX, DX		// index of match within the chunk
	SUBQ SI, DI
	ADDQ DI, DX
	MOVQ DX, (R8)
	VZEROUPPER
	RET
  2081  
// func Equal(a, b []byte) bool
// Slices of different lengths are unequal without reading memory;
// otherwise defer to the shared memeqbody kernel.
TEXT bytes·Equal(SB),NOSPLIT,$0-49
	MOVQ	a_len+8(FP), BX
	MOVQ	b_len+32(FP), CX
	CMPQ	BX, CX
	JNE	eqret
	MOVQ	a+0(FP), SI
	MOVQ	b+24(FP), DI
	LEAQ	ret+48(FP), AX	// memeqbody writes the result byte through AX
	JMP	runtime·memeqbody(SB)
eqret:
	MOVB	$0, ret+48(FP)
	RET
  2094  
  2095  
// func bytes.countByte(s []byte, c byte) int
// Wrapper: sets up the register contract for runtime·countByte.
TEXT bytes·countByte(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+24(FP), AL
	LEAQ ret+32(FP), R8
	JMP  runtime·countByte(SB)
  2102  
// func strings.countByte(s string, c byte) int
// Wrapper: sets up the register contract for runtime·countByte.
TEXT strings·countByte(SB),NOSPLIT,$0-32
	MOVQ s+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+16(FP), AL
	LEAQ ret+24(FP), R8
	JMP  runtime·countByte(SB)
  2109  
// countByte counts how many bytes in the buffer equal the sought byte
// and stores the count (as a 64-bit integer) at the result address.
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// This requires the POPCNT instruction
TEXT runtime·countByte(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0

	CMPQ BX, $16
	JLT small	// <16 bytes: single masked load, no accumulation loop

	MOVQ $0, R12 // Accumulator

	MOVQ SI, DI	// DI = running cursor; SI keeps the buffer start

	CMPQ BX, $32
	JA avx2	// >32 bytes: try AVX2 (falls back to sse: if unsupported)
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

	// Main loop: one 16-byte chunk per iteration while DI <= AX.
sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Count number of matching bytes
	POPCNTL DX, DX
	// Accumulate into R12
	ADDQ DX, R12
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JBE	sseloop

	// Get the number of bytes to consider in the last 16 bytes
	ANDQ $15, BX
	JZ end	// length was a multiple of 16: tail already fully processed

	// Create mask to ignore overlap between previous 16 byte block
	// and the next. After SAR/SAL by CL = 16-BX, R10 has only the top
	// BX bits of 0xFFFF set — exactly the not-yet-counted bytes of the
	// final (overlapping) 16-byte load.
	MOVQ $16,CX
	SUBQ BX, CX
	MOVQ $0xFFFF, R10
	SARQ CL, R10
	SALQ CL, R10

	// Process the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched so we need to mask part of it.
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ R10, DX
	POPCNTL DX, DX
	ADDQ DX, R12
end:
	MOVQ R12, (R8)	// store the accumulated count
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	endzero	// empty input: count is 0

	// Check if we'll load across a page boundary.
	// If SI+16 has bits 4..11 all clear, a 16-byte load at SI could
	// touch the next page; use the backward-load path instead.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	// We must ignore high bytes as they aren't part of our slice.
	// Create mask.
	MOVB BX, CX
	MOVQ $1, R10
	SALQ CL, R10
	SUBQ $1, R10	// R10 = (1<<BX)-1: low BX bits select valid bytes

	// Load data
	MOVOU	(SI), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ R10, DX
	POPCNTL DX, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	MOVQ	DX, (R8)
	RET
endzero:
	MOVQ $0, (R8)
	RET

endofpage:
	// We must ignore low bytes as they aren't part of our slice.
	// Same SAR/SAL trick as above: keep only the top BX bits.
	MOVQ $16,CX
	SUBQ BX, CX
	MOVQ $0xFFFF, R10
	SARQ CL, R10
	SALQ CL, R10

	// Load data into the high end of X1.
	// Loading backwards from SI+BX-16 stays within the current page.
	MOVOU	-16(SI)(BX*1), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ R10, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	POPCNTL DX, DX
	MOVQ	DX, (R8)
	RET

avx2:
	CMPB   runtime·support_avx2(SB), $1
	JNE sse	// AVX2 unavailable: fall back to the SSE loop
	MOVD AX, X0
	LEAQ -32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB  X0, Y1	// Y1 = sought byte replicated into all 32 lanes
avx2_loop:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3	// Y3 lane = 0xFF where Y2 matches the target
	VPMOVMSKB Y3, DX	// DX = 32-bit match bitmap
	POPCNTL DX, DX
	ADDQ DX, R12
	ADDQ $32, DI
	CMPQ DI, R11
	JLE avx2_loop

	// If last block is already processed,
	// skip to the end.
	CMPQ DI, R11
	JEQ endavx

	// Load address of the last 32 bytes.
	// There is an overlap with the previous block.
	MOVQ R11, DI
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, DX
	// Exit AVX mode.
	VZEROUPPER

	// Create mask to ignore overlap between previous 32 byte block
	// and the next. When BX is a multiple of 32 the 64-bit shifts by
	// CL=32 yield a zero mask, so the re-read tail adds nothing.
	ANDQ $31, BX
	MOVQ $32,CX
	SUBQ BX, CX
	MOVQ $0xFFFFFFFF, R10
	SARQ CL, R10
	SALQ CL, R10
	// Apply mask
	ANDQ R10, DX
	POPCNTL DX, DX
	ADDQ DX, R12
	MOVQ R12, (R8)
	RET
endavx:
	// Exit AVX mode.
	VZEROUPPER
	MOVQ R12, (R8)
	RET
  2284  
// return0 puts 0 in AX and returns.
// NOTE(review): its callers are elsewhere in the runtime (not visible in
// this chunk); presumably used where a zero return value must be forced
// into the return register — confirm against proc.go callers.
TEXT runtime·return0(SB), NOSPLIT, $0
	MOVL	$0, AX	// 32-bit write zero-extends through all of RAX
	RET
  2288  
  2289  
// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
// Must obey the gcc calling convention.
TEXT _cgo_topofstack(SB),NOSPLIT,$0
	get_tls(CX)	// CX = TLS base (go_tls.h macro)
	MOVQ	g(CX), AX	// AX = current g
	MOVQ	g_m(AX), AX	// AX = g.m
	MOVQ	m_curg(AX), AX	// AX = m.curg
	MOVQ	(g_stack+stack_hi)(AX), AX	// result in AX per C ABI
	RET
  2299  
// The top-most function running on a goroutine
// returns to goexit+PCQuantum.
TEXT runtime·goexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP: makes the CALL's return PC (goexit+1) fall inside this symbol
	CALL	runtime·goexit1(SB)	// does not return
	// traceback from goexit1 must hit code range of goexit
	BYTE	$0x90	// NOP: padding so the return PC stays within goexit's code range
  2307  
// This is called from .init_array and follows the platform, not Go, ABI.
// Appends the moduledata passed in DI (first argument under the System V
// AMD64 C calling convention) to the runtime's module linked list.
TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
	MOVQ	runtime·lastmoduledatap(SB), AX	// AX = current list tail
	MOVQ	DI, moduledata_next(AX)	// tail.next = new moduledata
	MOVQ	DI, runtime·lastmoduledatap(SB)	// tail pointer = new moduledata
	POPQ	R15
	RET