github.com/karrick/go@v0.0.0-20170817181416-d5b0ec858b37/src/runtime/asm_amd64.s

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
    11  	// copy arguments forward on an even stack
    12  	MOVQ	DI, AX		// argc
    13  	MOVQ	SI, BX		// argv
    14  	SUBQ	$(4*8+7), SP		// 2args 2auto
    15  	ANDQ	$~15, SP
    16  	MOVQ	AX, 16(SP)
    17  	MOVQ	BX, 24(SP)
    18  	
    19  	// create istack out of the given (operating system) stack.
    20  	// _cgo_init may update stackguard.
    21  	MOVQ	$runtime·g0(SB), DI
    22  	LEAQ	(-64*1024+104)(SP), BX
    23  	MOVQ	BX, g_stackguard0(DI)
    24  	MOVQ	BX, g_stackguard1(DI)
    25  	MOVQ	BX, (g_stack+stack_lo)(DI)
    26  	MOVQ	SP, (g_stack+stack_hi)(DI)
    27  
    28  	// find out information about the processor we're on
    29  	MOVL	$0, AX
    30  	CPUID
    31  	MOVL	AX, SI
    32  	CMPL	AX, $0
    33  	JE	nocpuinfo
    34  
    35  	// Figure out how to serialize RDTSC.
    36  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    37  	// Don't know about the rest, so let's do MFENCE.
    38  	CMPL	BX, $0x756E6547  // "Genu"
    39  	JNE	notintel
    40  	CMPL	DX, $0x49656E69  // "ineI"
    41  	JNE	notintel
    42  	CMPL	CX, $0x6C65746E  // "ntel"
    43  	JNE	notintel
    44  	MOVB	$1, runtime·isIntel(SB)
    45  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
    46  notintel:
    47  
    48  	// Load EAX=1 cpuid flags
    49  	MOVL	$1, AX
    50  	CPUID
    51  	MOVL	AX, runtime·processorVersionInfo(SB)
    52  
    53  	TESTL	$(1<<26), DX // SSE2
    54  	SETNE	runtime·support_sse2(SB)
    55  
    56  	TESTL	$(1<<9), CX // SSSE3
    57  	SETNE	runtime·support_ssse3(SB)
    58  
    59  	TESTL	$(1<<19), CX // SSE4.1
    60  	SETNE	runtime·support_sse41(SB)
    61  
    62  	TESTL	$(1<<20), CX // SSE4.2
    63  	SETNE	runtime·support_sse42(SB)
    64  
    65  	TESTL	$(1<<23), CX // POPCNT
    66  	SETNE	runtime·support_popcnt(SB)
    67  
    68  	TESTL	$(1<<25), CX // AES
    69  	SETNE	runtime·support_aes(SB)
    70  
    71  	TESTL	$(1<<27), CX // OSXSAVE
    72  	SETNE	runtime·support_osxsave(SB)
    73  
    74  	// If OS support for XMM and YMM is not present
    75  	// support_avx will be set back to false later.
    76  	TESTL	$(1<<28), CX // AVX
    77  	SETNE	runtime·support_avx(SB)
    78  
    79  eax7:
    80  	// Load EAX=7/ECX=0 cpuid flags
    81  	CMPL	SI, $7
    82  	JLT	osavx
    83  	MOVL	$7, AX
    84  	MOVL	$0, CX
    85  	CPUID
    86  
    87  	TESTL	$(1<<3), BX // BMI1
    88  	SETNE	runtime·support_bmi1(SB)
    89  
    90  	// If OS support for XMM and YMM is not present
    91  	// support_avx2 will be set back to false later.
     92  	TESTL	$(1<<5), BX // AVX2
    93  	SETNE	runtime·support_avx2(SB)
    94  
    95  	TESTL	$(1<<8), BX // BMI2
    96  	SETNE	runtime·support_bmi2(SB)
    97  
    98  	TESTL	$(1<<9), BX // ERMS
    99  	SETNE	runtime·support_erms(SB)
   100  
   101  osavx:
   102  	CMPB	runtime·support_osxsave(SB), $1
   103  	JNE	noavx
   104  	MOVL	$0, CX
   105  	// For XGETBV, OSXSAVE bit is required and sufficient
   106  	XGETBV
   107  	ANDL	$6, AX
   108  	CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
   109  	JE nocpuinfo
   110  noavx:
   111  	MOVB $0, runtime·support_avx(SB)
   112  	MOVB $0, runtime·support_avx2(SB)
   113  
   114  nocpuinfo:
   115  	// if there is an _cgo_init, call it.
   116  	MOVQ	_cgo_init(SB), AX
   117  	TESTQ	AX, AX
   118  	JZ	needtls
   119  	// g0 already in DI
   120  	MOVQ	DI, CX	// Win64 uses CX for first parameter
   121  	MOVQ	$setg_gcc<>(SB), SI
   122  	CALL	AX
   123  
   124  	// update stackguard after _cgo_init
   125  	MOVQ	$runtime·g0(SB), CX
   126  	MOVQ	(g_stack+stack_lo)(CX), AX
   127  	ADDQ	$const__StackGuard, AX
   128  	MOVQ	AX, g_stackguard0(CX)
   129  	MOVQ	AX, g_stackguard1(CX)
   130  
   131  #ifndef GOOS_windows
   132  	JMP ok
   133  #endif
   134  needtls:
   135  #ifdef GOOS_plan9
   136  	// skip TLS setup on Plan 9
   137  	JMP ok
   138  #endif
   139  #ifdef GOOS_solaris
   140  	// skip TLS setup on Solaris
   141  	JMP ok
   142  #endif
   143  
   144  	LEAQ	runtime·m0+m_tls(SB), DI
   145  	CALL	runtime·settls(SB)
   146  
   147  	// store through it, to make sure it works
   148  	get_tls(BX)
   149  	MOVQ	$0x123, g(BX)
   150  	MOVQ	runtime·m0+m_tls(SB), AX
   151  	CMPQ	AX, $0x123
   152  	JEQ 2(PC)
   153  	MOVL	AX, 0	// abort
   154  ok:
   155  	// set the per-goroutine and per-mach "registers"
   156  	get_tls(BX)
   157  	LEAQ	runtime·g0(SB), CX
   158  	MOVQ	CX, g(BX)
   159  	LEAQ	runtime·m0(SB), AX
   160  
   161  	// save m->g0 = g0
   162  	MOVQ	CX, m_g0(AX)
   163  	// save m0 to g0->m
   164  	MOVQ	AX, g_m(CX)
   165  
   166  	CLD				// convention is D is always left cleared
   167  	CALL	runtime·check(SB)
   168  
   169  	MOVL	16(SP), AX		// copy argc
   170  	MOVL	AX, 0(SP)
   171  	MOVQ	24(SP), AX		// copy argv
   172  	MOVQ	AX, 8(SP)
   173  	CALL	runtime·args(SB)
   174  	CALL	runtime·osinit(SB)
   175  	CALL	runtime·schedinit(SB)
   176  
   177  	// create a new goroutine to start program
   178  	MOVQ	$runtime·mainPC(SB), AX		// entry
   179  	PUSHQ	AX
   180  	PUSHQ	$0			// arg size
   181  	CALL	runtime·newproc(SB)
   182  	POPQ	AX
   183  	POPQ	AX
   184  
   185  	// start this M
   186  	CALL	runtime·mstart(SB)
   187  
   188  	MOVL	$0xf1, 0xf1  // crash
   189  	RET
   190  
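         // mainPC is a function value for runtime·main, to be passed to newproc above.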
   191  DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
   192  GLOBL	runtime·mainPC(SB),RODATA,$8
   193  
   194  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
   195  	BYTE	$0xcc
   196  	RET
   197  
   198  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   199  	// No per-thread init.
   200  	RET
   201  
   202  /*
   203   *  go-routine
   204   */
   205  
   206  // void gosave(Gobuf*)
   207  // save state in Gobuf; setjmp
   208  TEXT runtime·gosave(SB), NOSPLIT, $0-8
   209  	MOVQ	buf+0(FP), AX		// gobuf
   210  	LEAQ	buf+0(FP), BX		// caller's SP
   211  	MOVQ	BX, gobuf_sp(AX)
   212  	MOVQ	0(SP), BX		// caller's PC
   213  	MOVQ	BX, gobuf_pc(AX)
   214  	MOVQ	$0, gobuf_ret(AX)
   215  	MOVQ	BP, gobuf_bp(AX)
   216  	// Assert ctxt is zero. See func save.
   217  	MOVQ	gobuf_ctxt(AX), BX
   218  	TESTQ	BX, BX
   219  	JZ	2(PC)
   220  	CALL	runtime·badctxt(SB)
   221  	get_tls(CX)
   222  	MOVQ	g(CX), BX
   223  	MOVQ	BX, gobuf_g(AX)
   224  	RET
   225  
   226  // void gogo(Gobuf*)
   227  // restore state from Gobuf; longjmp
   228  TEXT runtime·gogo(SB), NOSPLIT, $16-8
   229  	MOVQ	buf+0(FP), BX		// gobuf
   230  
   231  	// If ctxt is not nil, invoke deletion barrier before overwriting.
   232  	MOVQ	gobuf_ctxt(BX), AX
   233  	TESTQ	AX, AX
   234  	JZ	nilctxt
   235  	LEAQ	gobuf_ctxt(BX), AX
   236  	MOVQ	AX, 0(SP)
   237  	MOVQ	$0, 8(SP)
   238  	CALL	runtime·writebarrierptr_prewrite(SB)
   239  	MOVQ	buf+0(FP), BX
   240  
   241  nilctxt:
   242  	MOVQ	gobuf_g(BX), DX
   243  	MOVQ	0(DX), CX		// make sure g != nil
   244  	get_tls(CX)
   245  	MOVQ	DX, g(CX)
   246  	MOVQ	gobuf_sp(BX), SP	// restore SP
   247  	MOVQ	gobuf_ret(BX), AX
   248  	MOVQ	gobuf_ctxt(BX), DX
   249  	MOVQ	gobuf_bp(BX), BP
   250  	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   251  	MOVQ	$0, gobuf_ret(BX)
   252  	MOVQ	$0, gobuf_ctxt(BX)
   253  	MOVQ	$0, gobuf_bp(BX)
   254  	MOVQ	gobuf_pc(BX), BX
   255  	JMP	BX
   256  
   257  // func mcall(fn func(*g))
   258  // Switch to m->g0's stack, call fn(g).
   259  // Fn must never return. It should gogo(&g->sched)
   260  // to keep running g.
   261  TEXT runtime·mcall(SB), NOSPLIT, $0-8
   262  	MOVQ	fn+0(FP), DI
   263  	
   264  	get_tls(CX)
   265  	MOVQ	g(CX), AX	// save state in g->sched
   266  	MOVQ	0(SP), BX	// caller's PC
   267  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   268  	LEAQ	fn+0(FP), BX	// caller's SP
   269  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   270  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   271  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   272  
   273  	// switch to m->g0 & its stack, call fn
   274  	MOVQ	g(CX), BX
   275  	MOVQ	g_m(BX), BX
   276  	MOVQ	m_g0(BX), SI
   277  	CMPQ	SI, AX	// if g == m->g0 call badmcall
   278  	JNE	3(PC)
   279  	MOVQ	$runtime·badmcall(SB), AX
   280  	JMP	AX
   281  	MOVQ	SI, g(CX)	// g = m->g0
   282  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   283  	PUSHQ	AX
   284  	MOVQ	DI, DX
   285  	MOVQ	0(DI), DI
   286  	CALL	DI
   287  	POPQ	AX
   288  	MOVQ	$runtime·badmcall2(SB), AX
   289  	JMP	AX
   290  	RET
   291  
   292  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   293  // of the G stack. We need to distinguish the routine that
   294  // lives at the bottom of the G stack from the one that lives
   295  // at the top of the system stack because the one at the top of
   296  // the system stack terminates the stack walk (see topofstack()).
   297  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   298  	RET
   299  
   300  // func systemstack(fn func())
   301  TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   302  	MOVQ	fn+0(FP), DI	// DI = fn
   303  	get_tls(CX)
   304  	MOVQ	g(CX), AX	// AX = g
   305  	MOVQ	g_m(AX), BX	// BX = m
   306  
   307  	MOVQ	m_gsignal(BX), DX	// DX = gsignal
   308  	CMPQ	AX, DX
   309  	JEQ	noswitch
   310  
   311  	MOVQ	m_g0(BX), DX	// DX = g0
   312  	CMPQ	AX, DX
   313  	JEQ	noswitch
   314  
   315  	MOVQ	m_curg(BX), R8
   316  	CMPQ	AX, R8
   317  	JEQ	switch
   318  	
   319  	// Bad: g is not gsignal, not g0, not curg. What is it?
   320  	MOVQ	$runtime·badsystemstack(SB), AX
   321  	CALL	AX
   322  
   323  switch:
   324  	// save our state in g->sched. Pretend to
   325  	// be systemstack_switch if the G stack is scanned.
   326  	MOVQ	$runtime·systemstack_switch(SB), SI
   327  	MOVQ	SI, (g_sched+gobuf_pc)(AX)
   328  	MOVQ	SP, (g_sched+gobuf_sp)(AX)
   329  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   330  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   331  
   332  	// switch to g0
   333  	MOVQ	DX, g(CX)
   334  	MOVQ	(g_sched+gobuf_sp)(DX), BX
   335  	// make it look like mstart called systemstack on g0, to stop traceback
   336  	SUBQ	$8, BX
   337  	MOVQ	$runtime·mstart(SB), DX
   338  	MOVQ	DX, 0(BX)
   339  	MOVQ	BX, SP
   340  
   341  	// call target function
   342  	MOVQ	DI, DX
   343  	MOVQ	0(DI), DI
   344  	CALL	DI
   345  
   346  	// switch back to g
   347  	get_tls(CX)
   348  	MOVQ	g(CX), AX
   349  	MOVQ	g_m(AX), BX
   350  	MOVQ	m_curg(BX), AX
   351  	MOVQ	AX, g(CX)
   352  	MOVQ	(g_sched+gobuf_sp)(AX), SP
   353  	MOVQ	$0, (g_sched+gobuf_sp)(AX)
   354  	RET
   355  
   356  noswitch:
   357  	// already on m stack, just call directly
   358  	MOVQ	DI, DX
   359  	MOVQ	0(DI), DI
   360  	CALL	DI
   361  	RET
   362  
   363  /*
   364   * support for morestack
   365   */
   366  
   367  // Called during function prolog when more stack is needed.
   368  //
   369  // The traceback routines see morestack on a g0 as being
   370  // the top of a stack (for example, morestack calling newstack
   371  // calling the scheduler calling newm calling gc), so we must
   372  // record an argument size. For that purpose, it has no arguments.
   373  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   374  	// Cannot grow scheduler stack (m->g0).
   375  	get_tls(CX)
   376  	MOVQ	g(CX), BX
   377  	MOVQ	g_m(BX), BX
   378  	MOVQ	m_g0(BX), SI
   379  	CMPQ	g(CX), SI
   380  	JNE	3(PC)
   381  	CALL	runtime·badmorestackg0(SB)
   382  	INT	$3
   383  
   384  	// Cannot grow signal stack (m->gsignal).
   385  	MOVQ	m_gsignal(BX), SI
   386  	CMPQ	g(CX), SI
   387  	JNE	3(PC)
   388  	CALL	runtime·badmorestackgsignal(SB)
   389  	INT	$3
   390  
   391  	// Called from f.
   392  	// Set m->morebuf to f's caller.
   393  	MOVQ	8(SP), AX	// f's caller's PC
   394  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   395  	LEAQ	16(SP), AX	// f's caller's SP
   396  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   397  	get_tls(CX)
   398  	MOVQ	g(CX), SI
   399  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   400  
   401  	// Set g->sched to context in f.
   402  	MOVQ	0(SP), AX // f's PC
   403  	MOVQ	AX, (g_sched+gobuf_pc)(SI)
   404  	MOVQ	SI, (g_sched+gobuf_g)(SI)
   405  	LEAQ	8(SP), AX // f's SP
   406  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   407  	MOVQ	BP, (g_sched+gobuf_bp)(SI)
   408  	// newstack will fill gobuf.ctxt.
   409  
   410  	// Call newstack on m->g0's stack.
   411  	MOVQ	m_g0(BX), BX
   412  	MOVQ	BX, g(CX)
   413  	MOVQ	(g_sched+gobuf_sp)(BX), SP
   414  	PUSHQ	DX	// ctxt argument
   415  	CALL	runtime·newstack(SB)
   416  	MOVQ	$0, 0x1003	// crash if newstack returns
   417  	POPQ	DX	// keep balance check happy
   418  	RET
   419  
   420  // morestack but not preserving ctxt.
   421  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   422  	MOVL	$0, DX
   423  	JMP	runtime·morestack(SB)
   424  
   425  // reflectcall: call a function with the given argument list
   426  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   427  // we don't have variable-sized frames, so we use a small number
   428  // of constant-sized-frame functions to encode a few bits of size in the pc.
   429  // Caution: ugly multiline assembly macros in your future!
   430  
   431  #define DISPATCH(NAME,MAXSIZE)		\
   432  	CMPQ	CX, $MAXSIZE;		\
   433  	JA	3(PC);			\
   434  	MOVQ	$NAME(SB), AX;		\
   435  	JMP	AX
   436  // Note: can't just "JMP NAME(SB)" - bad inlining results.
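         // For example, an argsize of 40 is above 32, so the call32 case is skipped; it is
         // not above 64, so control jumps to runtime·call64, which copies the 40 argument
         // bytes into its fixed 64-byte frame before calling the target.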
   437  
   438  TEXT reflect·call(SB), NOSPLIT, $0-0
   439  	JMP	·reflectcall(SB)
   440  
   441  TEXT ·reflectcall(SB), NOSPLIT, $0-32
   442  	MOVLQZX argsize+24(FP), CX
   443  	DISPATCH(runtime·call32, 32)
   444  	DISPATCH(runtime·call64, 64)
   445  	DISPATCH(runtime·call128, 128)
   446  	DISPATCH(runtime·call256, 256)
   447  	DISPATCH(runtime·call512, 512)
   448  	DISPATCH(runtime·call1024, 1024)
   449  	DISPATCH(runtime·call2048, 2048)
   450  	DISPATCH(runtime·call4096, 4096)
   451  	DISPATCH(runtime·call8192, 8192)
   452  	DISPATCH(runtime·call16384, 16384)
   453  	DISPATCH(runtime·call32768, 32768)
   454  	DISPATCH(runtime·call65536, 65536)
   455  	DISPATCH(runtime·call131072, 131072)
   456  	DISPATCH(runtime·call262144, 262144)
   457  	DISPATCH(runtime·call524288, 524288)
   458  	DISPATCH(runtime·call1048576, 1048576)
   459  	DISPATCH(runtime·call2097152, 2097152)
   460  	DISPATCH(runtime·call4194304, 4194304)
   461  	DISPATCH(runtime·call8388608, 8388608)
   462  	DISPATCH(runtime·call16777216, 16777216)
   463  	DISPATCH(runtime·call33554432, 33554432)
   464  	DISPATCH(runtime·call67108864, 67108864)
   465  	DISPATCH(runtime·call134217728, 134217728)
   466  	DISPATCH(runtime·call268435456, 268435456)
   467  	DISPATCH(runtime·call536870912, 536870912)
   468  	DISPATCH(runtime·call1073741824, 1073741824)
   469  	MOVQ	$runtime·badreflectcall(SB), AX
   470  	JMP	AX
   471  
   472  #define CALLFN(NAME,MAXSIZE)			\
   473  TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   474  	NO_LOCAL_POINTERS;			\
   475  	/* copy arguments to stack */		\
   476  	MOVQ	argptr+16(FP), SI;		\
   477  	MOVLQZX argsize+24(FP), CX;		\
   478  	MOVQ	SP, DI;				\
   479  	REP;MOVSB;				\
   480  	/* call function */			\
   481  	MOVQ	f+8(FP), DX;			\
   482  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   483  	CALL	(DX);				\
   484  	/* copy return values back */		\
   485  	MOVQ	argtype+0(FP), DX;		\
   486  	MOVQ	argptr+16(FP), DI;		\
   487  	MOVLQZX	argsize+24(FP), CX;		\
   488  	MOVLQZX	retoffset+28(FP), BX;		\
   489  	MOVQ	SP, SI;				\
   490  	ADDQ	BX, DI;				\
   491  	ADDQ	BX, SI;				\
   492  	SUBQ	BX, CX;				\
   493  	CALL	callRet<>(SB);			\
   494  	RET
   495  
   496  // callRet copies return values back at the end of call*. This is a
   497  // separate function so it can allocate stack space for the arguments
   498  // to reflectcallmove. It does not follow the Go ABI; it expects its
   499  // arguments in registers.
   500  TEXT callRet<>(SB), NOSPLIT, $32-0
   501  	NO_LOCAL_POINTERS
   502  	MOVQ	DX, 0(SP)
   503  	MOVQ	DI, 8(SP)
   504  	MOVQ	SI, 16(SP)
   505  	MOVQ	CX, 24(SP)
   506  	CALL	runtime·reflectcallmove(SB)
   507  	RET
   508  
   509  CALLFN(·call32, 32)
   510  CALLFN(·call64, 64)
   511  CALLFN(·call128, 128)
   512  CALLFN(·call256, 256)
   513  CALLFN(·call512, 512)
   514  CALLFN(·call1024, 1024)
   515  CALLFN(·call2048, 2048)
   516  CALLFN(·call4096, 4096)
   517  CALLFN(·call8192, 8192)
   518  CALLFN(·call16384, 16384)
   519  CALLFN(·call32768, 32768)
   520  CALLFN(·call65536, 65536)
   521  CALLFN(·call131072, 131072)
   522  CALLFN(·call262144, 262144)
   523  CALLFN(·call524288, 524288)
   524  CALLFN(·call1048576, 1048576)
   525  CALLFN(·call2097152, 2097152)
   526  CALLFN(·call4194304, 4194304)
   527  CALLFN(·call8388608, 8388608)
   528  CALLFN(·call16777216, 16777216)
   529  CALLFN(·call33554432, 33554432)
   530  CALLFN(·call67108864, 67108864)
   531  CALLFN(·call134217728, 134217728)
   532  CALLFN(·call268435456, 268435456)
   533  CALLFN(·call536870912, 536870912)
   534  CALLFN(·call1073741824, 1073741824)
   535  
   536  TEXT runtime·procyield(SB),NOSPLIT,$0-0
   537  	MOVL	cycles+0(FP), AX
   538  again:
   539  	PAUSE
   540  	SUBL	$1, AX
   541  	JNZ	again
   542  	RET
   543  
   544  
   545  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   546  	// Stores are already ordered on x86, so this is just a
   547  	// compile barrier.
   548  	RET
   549  
   550  // void jmpdefer(fn, sp);
   551  // called from deferreturn.
   552  // 1. pop the caller
    553  // 2. sub 5 bytes (the length of the CALL instruction) from the caller's return address
   554  // 3. jmp to the argument
   555  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   556  	MOVQ	fv+0(FP), DX	// fn
   557  	MOVQ	argp+8(FP), BX	// caller sp
   558  	LEAQ	-8(BX), SP	// caller sp after CALL
   559  	MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
   560  	SUBQ	$5, (SP)	// return to CALL again
   561  	MOVQ	0(DX), BX
   562  	JMP	BX	// but first run the deferred function
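         	// When the deferred function returns, it returns to the CALL deferreturn
         	// instruction (5 bytes before the saved return address), so deferreturn
         	// runs again and handles any remaining deferred calls.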
   563  
   564  // Save state of caller into g->sched. Smashes R8, R9.
   565  TEXT gosave<>(SB),NOSPLIT,$0
   566  	get_tls(R8)
   567  	MOVQ	g(R8), R8
   568  	MOVQ	0(SP), R9
   569  	MOVQ	R9, (g_sched+gobuf_pc)(R8)
   570  	LEAQ	8(SP), R9
   571  	MOVQ	R9, (g_sched+gobuf_sp)(R8)
   572  	MOVQ	$0, (g_sched+gobuf_ret)(R8)
   573  	MOVQ	BP, (g_sched+gobuf_bp)(R8)
   574  	// Assert ctxt is zero. See func save.
   575  	MOVQ	(g_sched+gobuf_ctxt)(R8), R9
   576  	TESTQ	R9, R9
   577  	JZ	2(PC)
   578  	CALL	runtime·badctxt(SB)
   579  	RET
   580  
   581  // func asmcgocall(fn, arg unsafe.Pointer) int32
   582  // Call fn(arg) on the scheduler stack,
   583  // aligned appropriately for the gcc ABI.
   584  // See cgocall.go for more details.
   585  TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   586  	MOVQ	fn+0(FP), AX
   587  	MOVQ	arg+8(FP), BX
   588  
   589  	MOVQ	SP, DX
   590  
   591  	// Figure out if we need to switch to m->g0 stack.
   592  	// We get called to create new OS threads too, and those
   593  	// come in on the m->g0 stack already.
   594  	get_tls(CX)
   595  	MOVQ	g(CX), R8
   596  	CMPQ	R8, $0
   597  	JEQ	nosave
   598  	MOVQ	g_m(R8), R8
   599  	MOVQ	m_g0(R8), SI
   600  	MOVQ	g(CX), DI
   601  	CMPQ	SI, DI
   602  	JEQ	nosave
   603  	MOVQ	m_gsignal(R8), SI
   604  	CMPQ	SI, DI
   605  	JEQ	nosave
   606  	
   607  	// Switch to system stack.
   608  	MOVQ	m_g0(R8), SI
   609  	CALL	gosave<>(SB)
   610  	MOVQ	SI, g(CX)
   611  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   612  
   613  	// Now on a scheduling stack (a pthread-created stack).
   614  	// Make sure we have enough room for 4 stack-backed fast-call
   615  	// registers as per windows amd64 calling convention.
   616  	SUBQ	$64, SP
   617  	ANDQ	$~15, SP	// alignment for gcc ABI
   618  	MOVQ	DI, 48(SP)	// save g
   619  	MOVQ	(g_stack+stack_hi)(DI), DI
   620  	SUBQ	DX, DI
   621  	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   622  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   623  	MOVQ	BX, CX		// CX = first argument in Win64
   624  	CALL	AX
   625  
   626  	// Restore registers, g, stack pointer.
   627  	get_tls(CX)
   628  	MOVQ	48(SP), DI
   629  	MOVQ	(g_stack+stack_hi)(DI), SI
   630  	SUBQ	40(SP), SI
   631  	MOVQ	DI, g(CX)
   632  	MOVQ	SI, SP
   633  
   634  	MOVL	AX, ret+16(FP)
   635  	RET
   636  
   637  nosave:
   638  	// Running on a system stack, perhaps even without a g.
   639  	// Having no g can happen during thread creation or thread teardown
   640  	// (see needm/dropm on Solaris, for example).
   641  	// This code is like the above sequence but without saving/restoring g
   642  	// and without worrying about the stack moving out from under us
   643  	// (because we're on a system stack, not a goroutine stack).
   644  	// The above code could be used directly if already on a system stack,
   645  	// but then the only path through this code would be a rare case on Solaris.
   646  	// Using this code for all "already on system stack" calls exercises it more,
   647  	// which should help keep it correct.
   648  	SUBQ	$64, SP
   649  	ANDQ	$~15, SP
   650  	MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
   651  	MOVQ	DX, 40(SP)	// save original stack pointer
   652  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   653  	MOVQ	BX, CX		// CX = first argument in Win64
   654  	CALL	AX
   655  	MOVQ	40(SP), SI	// restore original stack pointer
   656  	MOVQ	SI, SP
   657  	MOVL	AX, ret+16(FP)
   658  	RET
   659  
   660  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
   661  // Turn the fn into a Go func (by taking its address) and call
   662  // cgocallback_gofunc.
   663  TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
   664  	LEAQ	fn+0(FP), AX
   665  	MOVQ	AX, 0(SP)
   666  	MOVQ	frame+8(FP), AX
   667  	MOVQ	AX, 8(SP)
   668  	MOVQ	framesize+16(FP), AX
   669  	MOVQ	AX, 16(SP)
   670  	MOVQ	ctxt+24(FP), AX
   671  	MOVQ	AX, 24(SP)
   672  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   673  	CALL	AX
   674  	RET
   675  
   676  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
   677  // See cgocall.go for more details.
   678  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
   679  	NO_LOCAL_POINTERS
   680  
   681  	// If g is nil, Go did not create the current thread.
   682  	// Call needm to obtain one m for temporary use.
   683  	// In this case, we're running on the thread stack, so there's
   684  	// lots of space, but the linker doesn't know. Hide the call from
   685  	// the linker analysis by using an indirect call through AX.
   686  	get_tls(CX)
   687  #ifdef GOOS_windows
   688  	MOVL	$0, BX
   689  	CMPQ	CX, $0
   690  	JEQ	2(PC)
   691  #endif
   692  	MOVQ	g(CX), BX
   693  	CMPQ	BX, $0
   694  	JEQ	needm
   695  	MOVQ	g_m(BX), BX
   696  	MOVQ	BX, R8 // holds oldm until end of function
   697  	JMP	havem
   698  needm:
   699  	MOVQ	$0, 0(SP)
   700  	MOVQ	$runtime·needm(SB), AX
   701  	CALL	AX
   702  	MOVQ	0(SP), R8
   703  	get_tls(CX)
   704  	MOVQ	g(CX), BX
   705  	MOVQ	g_m(BX), BX
   706  	
   707  	// Set m->sched.sp = SP, so that if a panic happens
   708  	// during the function we are about to execute, it will
   709  	// have a valid SP to run on the g0 stack.
   710  	// The next few lines (after the havem label)
   711  	// will save this SP onto the stack and then write
   712  	// the same SP back to m->sched.sp. That seems redundant,
   713  	// but if an unrecovered panic happens, unwindm will
   714  	// restore the g->sched.sp from the stack location
   715  	// and then systemstack will try to use it. If we don't set it here,
   716  	// that restored SP will be uninitialized (typically 0) and
   717  	// will not be usable.
   718  	MOVQ	m_g0(BX), SI
   719  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   720  
   721  havem:
   722  	// Now there's a valid m, and we're running on its m->g0.
   723  	// Save current m->g0->sched.sp on stack and then set it to SP.
   724  	// Save current sp in m->g0->sched.sp in preparation for
   725  	// switch back to m->curg stack.
   726  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   727  	MOVQ	m_g0(BX), SI
   728  	MOVQ	(g_sched+gobuf_sp)(SI), AX
   729  	MOVQ	AX, 0(SP)
   730  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   731  
   732  	// Switch to m->curg stack and call runtime.cgocallbackg.
   733  	// Because we are taking over the execution of m->curg
   734  	// but *not* resuming what had been running, we need to
   735  	// save that information (m->curg->sched) so we can restore it.
   736  	// We can restore m->curg->sched.sp easily, because calling
   737  	// runtime.cgocallbackg leaves SP unchanged upon return.
   738  	// To save m->curg->sched.pc, we push it onto the stack.
   739  	// This has the added benefit that it looks to the traceback
   740  	// routine like cgocallbackg is going to return to that
   741  	// PC (because the frame we allocate below has the same
   742  	// size as cgocallback_gofunc's frame declared above)
   743  	// so that the traceback will seamlessly trace back into
   744  	// the earlier calls.
   745  	//
   746  	// In the new goroutine, 8(SP) holds the saved R8.
   747  	MOVQ	m_curg(BX), SI
   748  	MOVQ	SI, g(CX)
   749  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   750  	MOVQ	(g_sched+gobuf_pc)(SI), BX
   751  	MOVQ	BX, -8(DI)
   752  	// Compute the size of the frame, including return PC and, if
   753  	// GOEXPERIMENT=framepointer, the saved base pointer
   754  	MOVQ	ctxt+24(FP), BX
   755  	LEAQ	fv+0(FP), AX
   756  	SUBQ	SP, AX
   757  	SUBQ	AX, DI
   758  	MOVQ	DI, SP
   759  
   760  	MOVQ	R8, 8(SP)
   761  	MOVQ	BX, 0(SP)
   762  	CALL	runtime·cgocallbackg(SB)
   763  	MOVQ	8(SP), R8
   764  
   765  	// Compute the size of the frame again. FP and SP have
   766  	// completely different values here than they did above,
   767  	// but only their difference matters.
   768  	LEAQ	fv+0(FP), AX
   769  	SUBQ	SP, AX
   770  
   771  	// Restore g->sched (== m->curg->sched) from saved values.
   772  	get_tls(CX)
   773  	MOVQ	g(CX), SI
   774  	MOVQ	SP, DI
   775  	ADDQ	AX, DI
   776  	MOVQ	-8(DI), BX
   777  	MOVQ	BX, (g_sched+gobuf_pc)(SI)
   778  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   779  
   780  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   781  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   782  	// so we do not have to restore it.)
   783  	MOVQ	g(CX), BX
   784  	MOVQ	g_m(BX), BX
   785  	MOVQ	m_g0(BX), SI
   786  	MOVQ	SI, g(CX)
   787  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   788  	MOVQ	0(SP), AX
   789  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   790  	
   791  	// If the m on entry was nil, we called needm above to borrow an m
   792  	// for the duration of the call. Since the call is over, return it with dropm.
   793  	CMPQ	R8, $0
   794  	JNE 3(PC)
   795  	MOVQ	$runtime·dropm(SB), AX
   796  	CALL	AX
   797  
   798  	// Done!
   799  	RET
   800  
   801  // void setg(G*); set g. for use by needm.
   802  TEXT runtime·setg(SB), NOSPLIT, $0-8
   803  	MOVQ	gg+0(FP), BX
   804  #ifdef GOOS_windows
   805  	CMPQ	BX, $0
   806  	JNE	settls
   807  	MOVQ	$0, 0x28(GS)
   808  	RET
   809  settls:
   810  	MOVQ	g_m(BX), AX
   811  	LEAQ	m_tls(AX), AX
   812  	MOVQ	AX, 0x28(GS)
   813  #endif
   814  	get_tls(CX)
   815  	MOVQ	BX, g(CX)
   816  	RET
   817  
   818  // void setg_gcc(G*); set g called from gcc.
   819  TEXT setg_gcc<>(SB),NOSPLIT,$0
   820  	get_tls(AX)
   821  	MOVQ	DI, g(AX)
   822  	RET
   823  
   824  // check that SP is in range [g->stack.lo, g->stack.hi)
   825  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   826  	get_tls(CX)
   827  	MOVQ	g(CX), AX
   828  	CMPQ	(g_stack+stack_hi)(AX), SP
   829  	JHI	2(PC)
   830  	INT	$3
   831  	CMPQ	SP, (g_stack+stack_lo)(AX)
   832  	JHI	2(PC)
   833  	INT	$3
   834  	RET
   835  
   836  TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
   837  	MOVQ	argp+0(FP),AX		// addr of first arg
   838  	MOVQ	-8(AX),AX		// get calling pc
   839  	MOVQ	AX, ret+8(FP)
   840  	RET
   841  
   842  // func cputicks() int64
   843  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   844  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   845  	JNE	mfence
   846  	LFENCE
   847  	JMP	done
   848  mfence:
   849  	MFENCE
   850  done:
   851  	RDTSC
    852  	SHLQ	$32, DX		// RDTSC returns the high 32 bits of the counter in DX
    853  	ADDQ	DX, AX		// ...and the low 32 bits in AX; combine into one 64-bit count
   854  	MOVQ	AX, ret+0(FP)
   855  	RET
   856  
   857  // hash function using AES hardware instructions
   858  TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   859  	MOVQ	p+0(FP), AX	// ptr to data
   860  	MOVQ	s+16(FP), CX	// size
   861  	LEAQ	ret+24(FP), DX
   862  	JMP	runtime·aeshashbody(SB)
   863  
   864  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
   865  	MOVQ	p+0(FP), AX	// ptr to string struct
   866  	MOVQ	8(AX), CX	// length of string
   867  	MOVQ	(AX), AX	// string data
   868  	LEAQ	ret+16(FP), DX
   869  	JMP	runtime·aeshashbody(SB)
   870  
   871  // AX: data
   872  // CX: length
   873  // DX: address to put return value
   874  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   875  	// Fill an SSE register with our seeds.
   876  	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
   877  	PINSRW	$4, CX, X0			// 16 bits of length
   878  	PSHUFHW $0, X0, X0			// repeat length 4 times total
   879  	MOVO	X0, X1				// save unscrambled seed
   880  	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
   881  	AESENC	X0, X0				// scramble seed
   882  
   883  	CMPQ	CX, $16
   884  	JB	aes0to15
   885  	JE	aes16
   886  	CMPQ	CX, $32
   887  	JBE	aes17to32
   888  	CMPQ	CX, $64
   889  	JBE	aes33to64
   890  	CMPQ	CX, $128
   891  	JBE	aes65to128
   892  	JMP	aes129plus
   893  
   894  aes0to15:
   895  	TESTQ	CX, CX
   896  	JE	aes0
   897  
   898  	ADDQ	$16, AX
   899  	TESTW	$0xff0, AX
   900  	JE	endofpage
   901  
   902  	// 16 bytes loaded at this address won't cross
   903  	// a page boundary, so we can load it directly.
   904  	MOVOU	-16(AX), X1
    905  	ADDQ	CX, CX			// CX = 2*length, so (CX*8) below indexes 16-byte mask entries
    906  	MOVQ	$masks<>(SB), AX
    907  	PAND	(AX)(CX*8), X1		// zero out the bytes beyond the data's length
   908  final1:
   909  	PXOR	X0, X1	// xor data with seed
   910  	AESENC	X1, X1	// scramble combo 3 times
   911  	AESENC	X1, X1
   912  	AESENC	X1, X1
   913  	MOVQ	X1, (DX)
   914  	RET
   915  
   916  endofpage:
   917  	// address ends in 1111xxxx. Might be up against
   918  	// a page boundary, so load ending at last byte.
   919  	// Then shift bytes down using pshufb.
   920  	MOVOU	-32(AX)(CX*1), X1
   921  	ADDQ	CX, CX
   922  	MOVQ	$shifts<>(SB), AX
   923  	PSHUFB	(AX)(CX*8), X1
   924  	JMP	final1
   925  
   926  aes0:
   927  	// Return scrambled input seed
   928  	AESENC	X0, X0
   929  	MOVQ	X0, (DX)
   930  	RET
   931  
   932  aes16:
   933  	MOVOU	(AX), X1
   934  	JMP	final1
   935  
   936  aes17to32:
   937  	// make second starting seed
   938  	PXOR	runtime·aeskeysched+16(SB), X1
   939  	AESENC	X1, X1
   940  	
   941  	// load data to be hashed
   942  	MOVOU	(AX), X2
   943  	MOVOU	-16(AX)(CX*1), X3
   944  
   945  	// xor with seed
   946  	PXOR	X0, X2
   947  	PXOR	X1, X3
   948  
   949  	// scramble 3 times
   950  	AESENC	X2, X2
   951  	AESENC	X3, X3
   952  	AESENC	X2, X2
   953  	AESENC	X3, X3
   954  	AESENC	X2, X2
   955  	AESENC	X3, X3
   956  
   957  	// combine results
   958  	PXOR	X3, X2
   959  	MOVQ	X2, (DX)
   960  	RET
   961  
   962  aes33to64:
   963  	// make 3 more starting seeds
   964  	MOVO	X1, X2
   965  	MOVO	X1, X3
   966  	PXOR	runtime·aeskeysched+16(SB), X1
   967  	PXOR	runtime·aeskeysched+32(SB), X2
   968  	PXOR	runtime·aeskeysched+48(SB), X3
   969  	AESENC	X1, X1
   970  	AESENC	X2, X2
   971  	AESENC	X3, X3
   972  	
   973  	MOVOU	(AX), X4
   974  	MOVOU	16(AX), X5
   975  	MOVOU	-32(AX)(CX*1), X6
   976  	MOVOU	-16(AX)(CX*1), X7
   977  
   978  	PXOR	X0, X4
   979  	PXOR	X1, X5
   980  	PXOR	X2, X6
   981  	PXOR	X3, X7
   982  	
   983  	AESENC	X4, X4
   984  	AESENC	X5, X5
   985  	AESENC	X6, X6
   986  	AESENC	X7, X7
   987  	
   988  	AESENC	X4, X4
   989  	AESENC	X5, X5
   990  	AESENC	X6, X6
   991  	AESENC	X7, X7
   992  	
   993  	AESENC	X4, X4
   994  	AESENC	X5, X5
   995  	AESENC	X6, X6
   996  	AESENC	X7, X7
   997  
   998  	PXOR	X6, X4
   999  	PXOR	X7, X5
  1000  	PXOR	X5, X4
  1001  	MOVQ	X4, (DX)
  1002  	RET
  1003  
  1004  aes65to128:
  1005  	// make 7 more starting seeds
  1006  	MOVO	X1, X2
  1007  	MOVO	X1, X3
  1008  	MOVO	X1, X4
  1009  	MOVO	X1, X5
  1010  	MOVO	X1, X6
  1011  	MOVO	X1, X7
  1012  	PXOR	runtime·aeskeysched+16(SB), X1
  1013  	PXOR	runtime·aeskeysched+32(SB), X2
  1014  	PXOR	runtime·aeskeysched+48(SB), X3
  1015  	PXOR	runtime·aeskeysched+64(SB), X4
  1016  	PXOR	runtime·aeskeysched+80(SB), X5
  1017  	PXOR	runtime·aeskeysched+96(SB), X6
  1018  	PXOR	runtime·aeskeysched+112(SB), X7
  1019  	AESENC	X1, X1
  1020  	AESENC	X2, X2
  1021  	AESENC	X3, X3
  1022  	AESENC	X4, X4
  1023  	AESENC	X5, X5
  1024  	AESENC	X6, X6
  1025  	AESENC	X7, X7
  1026  
  1027  	// load data
  1028  	MOVOU	(AX), X8
  1029  	MOVOU	16(AX), X9
  1030  	MOVOU	32(AX), X10
  1031  	MOVOU	48(AX), X11
  1032  	MOVOU	-64(AX)(CX*1), X12
  1033  	MOVOU	-48(AX)(CX*1), X13
  1034  	MOVOU	-32(AX)(CX*1), X14
  1035  	MOVOU	-16(AX)(CX*1), X15
  1036  
  1037  	// xor with seed
  1038  	PXOR	X0, X8
  1039  	PXOR	X1, X9
  1040  	PXOR	X2, X10
  1041  	PXOR	X3, X11
  1042  	PXOR	X4, X12
  1043  	PXOR	X5, X13
  1044  	PXOR	X6, X14
  1045  	PXOR	X7, X15
  1046  
  1047  	// scramble 3 times
  1048  	AESENC	X8, X8
  1049  	AESENC	X9, X9
  1050  	AESENC	X10, X10
  1051  	AESENC	X11, X11
  1052  	AESENC	X12, X12
  1053  	AESENC	X13, X13
  1054  	AESENC	X14, X14
  1055  	AESENC	X15, X15
  1056  
  1057  	AESENC	X8, X8
  1058  	AESENC	X9, X9
  1059  	AESENC	X10, X10
  1060  	AESENC	X11, X11
  1061  	AESENC	X12, X12
  1062  	AESENC	X13, X13
  1063  	AESENC	X14, X14
  1064  	AESENC	X15, X15
  1065  
  1066  	AESENC	X8, X8
  1067  	AESENC	X9, X9
  1068  	AESENC	X10, X10
  1069  	AESENC	X11, X11
  1070  	AESENC	X12, X12
  1071  	AESENC	X13, X13
  1072  	AESENC	X14, X14
  1073  	AESENC	X15, X15
  1074  
  1075  	// combine results
  1076  	PXOR	X12, X8
  1077  	PXOR	X13, X9
  1078  	PXOR	X14, X10
  1079  	PXOR	X15, X11
  1080  	PXOR	X10, X8
  1081  	PXOR	X11, X9
  1082  	PXOR	X9, X8
  1083  	MOVQ	X8, (DX)
  1084  	RET
  1085  
  1086  aes129plus:
  1087  	// make 7 more starting seeds
  1088  	MOVO	X1, X2
  1089  	MOVO	X1, X3
  1090  	MOVO	X1, X4
  1091  	MOVO	X1, X5
  1092  	MOVO	X1, X6
  1093  	MOVO	X1, X7
  1094  	PXOR	runtime·aeskeysched+16(SB), X1
  1095  	PXOR	runtime·aeskeysched+32(SB), X2
  1096  	PXOR	runtime·aeskeysched+48(SB), X3
  1097  	PXOR	runtime·aeskeysched+64(SB), X4
  1098  	PXOR	runtime·aeskeysched+80(SB), X5
  1099  	PXOR	runtime·aeskeysched+96(SB), X6
  1100  	PXOR	runtime·aeskeysched+112(SB), X7
  1101  	AESENC	X1, X1
  1102  	AESENC	X2, X2
  1103  	AESENC	X3, X3
  1104  	AESENC	X4, X4
  1105  	AESENC	X5, X5
  1106  	AESENC	X6, X6
  1107  	AESENC	X7, X7
  1108  	
  1109  	// start with last (possibly overlapping) block
  1110  	MOVOU	-128(AX)(CX*1), X8
  1111  	MOVOU	-112(AX)(CX*1), X9
  1112  	MOVOU	-96(AX)(CX*1), X10
  1113  	MOVOU	-80(AX)(CX*1), X11
  1114  	MOVOU	-64(AX)(CX*1), X12
  1115  	MOVOU	-48(AX)(CX*1), X13
  1116  	MOVOU	-32(AX)(CX*1), X14
  1117  	MOVOU	-16(AX)(CX*1), X15
  1118  
  1119  	// xor in seed
  1120  	PXOR	X0, X8
  1121  	PXOR	X1, X9
  1122  	PXOR	X2, X10
  1123  	PXOR	X3, X11
  1124  	PXOR	X4, X12
  1125  	PXOR	X5, X13
  1126  	PXOR	X6, X14
  1127  	PXOR	X7, X15
  1128  	
  1129  	// compute number of remaining 128-byte blocks
  1130  	DECQ	CX
  1131  	SHRQ	$7, CX
  1132  	
  1133  aesloop:
  1134  	// scramble state
  1135  	AESENC	X8, X8
  1136  	AESENC	X9, X9
  1137  	AESENC	X10, X10
  1138  	AESENC	X11, X11
  1139  	AESENC	X12, X12
  1140  	AESENC	X13, X13
  1141  	AESENC	X14, X14
  1142  	AESENC	X15, X15
  1143  
  1144  	// scramble state, xor in a block
  1145  	MOVOU	(AX), X0
  1146  	MOVOU	16(AX), X1
  1147  	MOVOU	32(AX), X2
  1148  	MOVOU	48(AX), X3
  1149  	AESENC	X0, X8
  1150  	AESENC	X1, X9
  1151  	AESENC	X2, X10
  1152  	AESENC	X3, X11
  1153  	MOVOU	64(AX), X4
  1154  	MOVOU	80(AX), X5
  1155  	MOVOU	96(AX), X6
  1156  	MOVOU	112(AX), X7
  1157  	AESENC	X4, X12
  1158  	AESENC	X5, X13
  1159  	AESENC	X6, X14
  1160  	AESENC	X7, X15
  1161  
  1162  	ADDQ	$128, AX
  1163  	DECQ	CX
  1164  	JNE	aesloop
  1165  
  1166  	// 3 more scrambles to finish
  1167  	AESENC	X8, X8
  1168  	AESENC	X9, X9
  1169  	AESENC	X10, X10
  1170  	AESENC	X11, X11
  1171  	AESENC	X12, X12
  1172  	AESENC	X13, X13
  1173  	AESENC	X14, X14
  1174  	AESENC	X15, X15
  1175  	AESENC	X8, X8
  1176  	AESENC	X9, X9
  1177  	AESENC	X10, X10
  1178  	AESENC	X11, X11
  1179  	AESENC	X12, X12
  1180  	AESENC	X13, X13
  1181  	AESENC	X14, X14
  1182  	AESENC	X15, X15
  1183  	AESENC	X8, X8
  1184  	AESENC	X9, X9
  1185  	AESENC	X10, X10
  1186  	AESENC	X11, X11
  1187  	AESENC	X12, X12
  1188  	AESENC	X13, X13
  1189  	AESENC	X14, X14
  1190  	AESENC	X15, X15
  1191  
  1192  	PXOR	X12, X8
  1193  	PXOR	X13, X9
  1194  	PXOR	X14, X10
  1195  	PXOR	X15, X11
  1196  	PXOR	X10, X8
  1197  	PXOR	X11, X9
  1198  	PXOR	X9, X8
  1199  	MOVQ	X8, (DX)
  1200  	RET
  1201  	
  1202  TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
  1203  	MOVQ	p+0(FP), AX	// ptr to data
  1204  	MOVQ	h+8(FP), X0	// seed
  1205  	PINSRD	$2, (AX), X0	// data
  1206  	AESENC	runtime·aeskeysched+0(SB), X0
  1207  	AESENC	runtime·aeskeysched+16(SB), X0
  1208  	AESENC	runtime·aeskeysched+32(SB), X0
  1209  	MOVQ	X0, ret+16(FP)
  1210  	RET
  1211  
  1212  TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
  1213  	MOVQ	p+0(FP), AX	// ptr to data
  1214  	MOVQ	h+8(FP), X0	// seed
  1215  	PINSRQ	$1, (AX), X0	// data
  1216  	AESENC	runtime·aeskeysched+0(SB), X0
  1217  	AESENC	runtime·aeskeysched+16(SB), X0
  1218  	AESENC	runtime·aeskeysched+32(SB), X0
  1219  	MOVQ	X0, ret+16(FP)
  1220  	RET
  1221  
  1222  // simple mask to get rid of data in the high part of the register.
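         // Entry i (the 16 bytes at masks<>+16*i) keeps the low i bytes and zeroes the rest.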
  1223  DATA masks<>+0x00(SB)/8, $0x0000000000000000
  1224  DATA masks<>+0x08(SB)/8, $0x0000000000000000
  1225  DATA masks<>+0x10(SB)/8, $0x00000000000000ff
  1226  DATA masks<>+0x18(SB)/8, $0x0000000000000000
  1227  DATA masks<>+0x20(SB)/8, $0x000000000000ffff
  1228  DATA masks<>+0x28(SB)/8, $0x0000000000000000
  1229  DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
  1230  DATA masks<>+0x38(SB)/8, $0x0000000000000000
  1231  DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
  1232  DATA masks<>+0x48(SB)/8, $0x0000000000000000
  1233  DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
  1234  DATA masks<>+0x58(SB)/8, $0x0000000000000000
  1235  DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
  1236  DATA masks<>+0x68(SB)/8, $0x0000000000000000
  1237  DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
  1238  DATA masks<>+0x78(SB)/8, $0x0000000000000000
  1239  DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
  1240  DATA masks<>+0x88(SB)/8, $0x0000000000000000
  1241  DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
  1242  DATA masks<>+0x98(SB)/8, $0x00000000000000ff
  1243  DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
  1244  DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
  1245  DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
  1246  DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
  1247  DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
  1248  DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
  1249  DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
  1250  DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
  1251  DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
  1252  DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
  1253  DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
  1254  DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
  1255  GLOBL masks<>(SB),RODATA,$256
  1256  
  1257  TEXT ·checkASM(SB),NOSPLIT,$0-1
   1258  	// check that masks<>(SB) and shifts<>(SB) are 16-byte aligned
  1259  	MOVQ	$masks<>(SB), AX
  1260  	MOVQ	$shifts<>(SB), BX
  1261  	ORQ	BX, AX
  1262  	TESTQ	$15, AX
  1263  	SETEQ	ret+0(FP)
  1264  	RET
  1265  
  1266  // these are arguments to pshufb. They move data down from
  1267  // the high bytes of the register to the low bytes of the register.
  1268  // index is how many bytes to move.
  1269  DATA shifts<>+0x00(SB)/8, $0x0000000000000000
  1270  DATA shifts<>+0x08(SB)/8, $0x0000000000000000
  1271  DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
  1272  DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
  1273  DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
  1274  DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
  1275  DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
  1276  DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
  1277  DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
  1278  DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
  1279  DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
  1280  DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
  1281  DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
  1282  DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
  1283  DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
  1284  DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
  1285  DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
  1286  DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
  1287  DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
  1288  DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
  1289  DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
  1290  DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
  1291  DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
  1292  DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
  1293  DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
  1294  DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
  1295  DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
  1296  DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
  1297  DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
  1298  DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
  1299  DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
  1300  DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
  1301  GLOBL shifts<>(SB),RODATA,$256
  1302  
  1303  // memequal(p, q unsafe.Pointer, size uintptr) bool
  1304  TEXT runtime·memequal(SB),NOSPLIT,$0-25
  1305  	MOVQ	a+0(FP), SI
  1306  	MOVQ	b+8(FP), DI
  1307  	CMPQ	SI, DI
  1308  	JEQ	eq
  1309  	MOVQ	size+16(FP), BX
  1310  	LEAQ	ret+24(FP), AX
  1311  	JMP	runtime·memeqbody(SB)
  1312  eq:
  1313  	MOVB	$1, ret+24(FP)
  1314  	RET
  1315  
  1316  // memequal_varlen(a, b unsafe.Pointer) bool
  1317  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
  1318  	MOVQ	a+0(FP), SI
  1319  	MOVQ	b+8(FP), DI
  1320  	CMPQ	SI, DI
  1321  	JEQ	eq
  1322  	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
  1323  	LEAQ	ret+16(FP), AX
  1324  	JMP	runtime·memeqbody(SB)
  1325  eq:
  1326  	MOVB	$1, ret+16(FP)
  1327  	RET
  1328  
  1329  // eqstring tests whether two strings are equal.
  1330  // The compiler guarantees that strings passed
  1331  // to eqstring have equal length.
  1332  // See runtime_test.go:eqstring_generic for
  1333  // equivalent Go code.
  1334  TEXT runtime·eqstring(SB),NOSPLIT,$0-33
  1335  	MOVQ	s1_base+0(FP), SI
  1336  	MOVQ	s2_base+16(FP), DI
  1337  	CMPQ	SI, DI
  1338  	JEQ	eq
  1339  	MOVQ	s1_len+8(FP), BX
  1340  	LEAQ	ret+32(FP), AX
  1341  	JMP	runtime·memeqbody(SB)
  1342  eq:
  1343  	MOVB	$1, ret+32(FP)
  1344  	RET
  1345  
  1346  // a in SI
  1347  // b in DI
  1348  // count in BX
  1349  // address of result byte in AX
  1350  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1351  	CMPQ	BX, $8
  1352  	JB	small
  1353  	CMPQ	BX, $64
  1354  	JB	bigloop
  1355  	CMPB    runtime·support_avx2(SB), $1
  1356  	JE	hugeloop_avx2
  1357  	
  1358  	// 64 bytes at a time using xmm registers
  1359  hugeloop:
  1360  	CMPQ	BX, $64
  1361  	JB	bigloop
  1362  	MOVOU	(SI), X0
  1363  	MOVOU	(DI), X1
  1364  	MOVOU	16(SI), X2
  1365  	MOVOU	16(DI), X3
  1366  	MOVOU	32(SI), X4
  1367  	MOVOU	32(DI), X5
  1368  	MOVOU	48(SI), X6
  1369  	MOVOU	48(DI), X7
  1370  	PCMPEQB	X1, X0
  1371  	PCMPEQB	X3, X2
  1372  	PCMPEQB	X5, X4
  1373  	PCMPEQB	X7, X6
  1374  	PAND	X2, X0
  1375  	PAND	X6, X4
  1376  	PAND	X4, X0
  1377  	PMOVMSKB X0, DX
  1378  	ADDQ	$64, SI
  1379  	ADDQ	$64, DI
  1380  	SUBQ	$64, BX
  1381  	CMPL	DX, $0xffff
  1382  	JEQ	hugeloop
  1383  	MOVB	$0, (AX)
  1384  	RET
  1385  
  1386  	// 64 bytes at a time using ymm registers
  1387  hugeloop_avx2:
  1388  	CMPQ	BX, $64
  1389  	JB	bigloop_avx2
  1390  	VMOVDQU	(SI), Y0
  1391  	VMOVDQU	(DI), Y1
  1392  	VMOVDQU	32(SI), Y2
  1393  	VMOVDQU	32(DI), Y3
  1394  	VPCMPEQB	Y1, Y0, Y4
  1395  	VPCMPEQB	Y2, Y3, Y5
  1396  	VPAND	Y4, Y5, Y6
  1397  	VPMOVMSKB Y6, DX
  1398  	ADDQ	$64, SI
  1399  	ADDQ	$64, DI
  1400  	SUBQ	$64, BX
  1401  	CMPL	DX, $0xffffffff
  1402  	JEQ	hugeloop_avx2
  1403  	VZEROUPPER
  1404  	MOVB	$0, (AX)
  1405  	RET
  1406  
  1407  bigloop_avx2:
  1408  	VZEROUPPER
  1409  
  1410  	// 8 bytes at a time using 64-bit register
  1411  bigloop:
  1412  	CMPQ	BX, $8
  1413  	JBE	leftover
  1414  	MOVQ	(SI), CX
  1415  	MOVQ	(DI), DX
  1416  	ADDQ	$8, SI
  1417  	ADDQ	$8, DI
  1418  	SUBQ	$8, BX
  1419  	CMPQ	CX, DX
  1420  	JEQ	bigloop
  1421  	MOVB	$0, (AX)
  1422  	RET
  1423  
  1424  	// remaining 0-8 bytes
  1425  leftover:
  1426  	MOVQ	-8(SI)(BX*1), CX
  1427  	MOVQ	-8(DI)(BX*1), DX
  1428  	CMPQ	CX, DX
  1429  	SETEQ	(AX)
  1430  	RET
  1431  
  1432  small:
  1433  	CMPQ	BX, $0
  1434  	JEQ	equal
  1435  
   1436  	LEAQ	0(BX*8), CX	// bytes left -> bits left
   1437  	NEGQ	CX		// - bits left (== 64 - bits left mod 64)
  1438  
  1439  	CMPB	SI, $0xf8
  1440  	JA	si_high
  1441  
  1442  	// load at SI won't cross a page boundary.
  1443  	MOVQ	(SI), SI
  1444  	JMP	si_finish
  1445  si_high:
   1446  	// address ends in 11111xxx. Load the 8 bytes ending at the last byte we want, then shift into position.
  1447  	MOVQ	-8(SI)(BX*1), SI
  1448  	SHRQ	CX, SI
  1449  si_finish:
  1450  
  1451  	// same for DI.
  1452  	CMPB	DI, $0xf8
  1453  	JA	di_high
  1454  	MOVQ	(DI), DI
  1455  	JMP	di_finish
  1456  di_high:
  1457  	MOVQ	-8(DI)(BX*1), DI
  1458  	SHRQ	CX, DI
  1459  di_finish:
  1460  
  1461  	SUBQ	SI, DI
  1462  	SHLQ	CX, DI
  1463  equal:
  1464  	SETEQ	(AX)
  1465  	RET
  1466  
  1467  TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
  1468  	MOVQ	s1_base+0(FP), SI
  1469  	MOVQ	s1_len+8(FP), BX
  1470  	MOVQ	s2_base+16(FP), DI
  1471  	MOVQ	s2_len+24(FP), DX
  1472  	LEAQ	ret+32(FP), R9
  1473  	JMP	runtime·cmpbody(SB)
  1474  
  1475  TEXT bytes·Compare(SB),NOSPLIT,$0-56
  1476  	MOVQ	s1+0(FP), SI
  1477  	MOVQ	s1+8(FP), BX
  1478  	MOVQ	s2+24(FP), DI
  1479  	MOVQ	s2+32(FP), DX
  1480  	LEAQ	res+48(FP), R9
  1481  	JMP	runtime·cmpbody(SB)
  1482  
  1483  // input:
  1484  //   SI = a
  1485  //   DI = b
  1486  //   BX = alen
  1487  //   DX = blen
  1488  //   R9 = address of output word (stores -1/0/1 here)
  1489  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1490  	CMPQ	SI, DI
  1491  	JEQ	allsame
  1492  	CMPQ	BX, DX
  1493  	MOVQ	DX, R8
  1494  	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
  1495  	CMPQ	R8, $8
  1496  	JB	small
  1497  
  1498  	CMPQ	R8, $63
  1499  	JBE	loop
  1500  	CMPB    runtime·support_avx2(SB), $1
  1501  	JEQ     big_loop_avx2
  1502  	JMP	big_loop
  1503  loop:
  1504  	CMPQ	R8, $16
  1505  	JBE	_0through16
  1506  	MOVOU	(SI), X0
  1507  	MOVOU	(DI), X1
  1508  	PCMPEQB X0, X1
  1509  	PMOVMSKB X1, AX
  1510  	XORQ	$0xffff, AX	// convert EQ to NE
  1511  	JNE	diff16	// branch if at least one byte is not equal
  1512  	ADDQ	$16, SI
  1513  	ADDQ	$16, DI
  1514  	SUBQ	$16, R8
  1515  	JMP	loop
  1516  	
  1517  diff64:
  1518  	ADDQ	$48, SI
  1519  	ADDQ	$48, DI
  1520  	JMP	diff16
  1521  diff48:
  1522  	ADDQ	$32, SI
  1523  	ADDQ	$32, DI
  1524  	JMP	diff16
  1525  diff32:
  1526  	ADDQ	$16, SI
  1527  	ADDQ	$16, DI
  1528  	// AX = bit mask of differences
  1529  diff16:
  1530  	BSFQ	AX, BX	// index of first byte that differs
  1531  	XORQ	AX, AX
  1532  	MOVB	(SI)(BX*1), CX
  1533  	CMPB	CX, (DI)(BX*1)
  1534  	SETHI	AX
  1535  	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
  1536  	MOVQ	AX, (R9)
  1537  	RET
  1538  
  1539  	// 0 through 16 bytes left, alen>=8, blen>=8
  1540  _0through16:
  1541  	CMPQ	R8, $8
  1542  	JBE	_0through8
  1543  	MOVQ	(SI), AX
  1544  	MOVQ	(DI), CX
  1545  	CMPQ	AX, CX
  1546  	JNE	diff8
  1547  _0through8:
  1548  	MOVQ	-8(SI)(R8*1), AX
  1549  	MOVQ	-8(DI)(R8*1), CX
  1550  	CMPQ	AX, CX
  1551  	JEQ	allsame
  1552  
  1553  	// AX and CX contain parts of a and b that differ.
  1554  diff8:
  1555  	BSWAPQ	AX	// reverse order of bytes
  1556  	BSWAPQ	CX
  1557  	XORQ	AX, CX
  1558  	BSRQ	CX, CX	// index of highest bit difference
  1559  	SHRQ	CX, AX	// move a's bit to bottom
  1560  	ANDQ	$1, AX	// mask bit
  1561  	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
  1562  	MOVQ	AX, (R9)
  1563  	RET
  1564  
  1565  	// 0-7 bytes in common
  1566  small:
  1567  	LEAQ	(R8*8), CX	// bytes left -> bits left
   1568  	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
  1569  	JEQ	allsame
  1570  
   1571  	// load bytes of a into the high bytes of SI
  1572  	CMPB	SI, $0xf8
  1573  	JA	si_high
  1574  	MOVQ	(SI), SI
  1575  	JMP	si_finish
  1576  si_high:
  1577  	MOVQ	-8(SI)(R8*1), SI
  1578  	SHRQ	CX, SI
  1579  si_finish:
  1580  	SHLQ	CX, SI
  1581  
   1582  	// load bytes of b into the high bytes of DI
  1583  	CMPB	DI, $0xf8
  1584  	JA	di_high
  1585  	MOVQ	(DI), DI
  1586  	JMP	di_finish
  1587  di_high:
  1588  	MOVQ	-8(DI)(R8*1), DI
  1589  	SHRQ	CX, DI
  1590  di_finish:
  1591  	SHLQ	CX, DI
  1592  
  1593  	BSWAPQ	SI	// reverse order of bytes
  1594  	BSWAPQ	DI
  1595  	XORQ	SI, DI	// find bit differences
  1596  	JEQ	allsame
  1597  	BSRQ	DI, CX	// index of highest bit difference
  1598  	SHRQ	CX, SI	// move a's bit to bottom
  1599  	ANDQ	$1, SI	// mask bit
  1600  	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
  1601  	MOVQ	AX, (R9)
  1602  	RET
  1603  
  1604  allsame:
  1605  	XORQ	AX, AX
  1606  	XORQ	CX, CX
  1607  	CMPQ	BX, DX
  1608  	SETGT	AX	// 1 if alen > blen
  1609  	SETEQ	CX	// 1 if alen == blen
  1610  	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
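         	// e.g. alen > blen gives AX=1, CX=0 -> +1; alen == blen gives AX=0, CX=1 -> 0;
         	// alen < blen gives AX=0, CX=0 -> -1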
  1611  	MOVQ	AX, (R9)
  1612  	RET
  1613  
  1614  	// this works for >= 64 bytes of data.
  1615  big_loop:
  1616  	MOVOU	(SI), X0
  1617  	MOVOU	(DI), X1
  1618  	PCMPEQB X0, X1
  1619  	PMOVMSKB X1, AX
  1620  	XORQ	$0xffff, AX
  1621  	JNE	diff16
  1622  
  1623  	MOVOU	16(SI), X0
  1624  	MOVOU	16(DI), X1
  1625  	PCMPEQB X0, X1
  1626  	PMOVMSKB X1, AX
  1627  	XORQ	$0xffff, AX
  1628  	JNE	diff32
  1629  
  1630  	MOVOU	32(SI), X0
  1631  	MOVOU	32(DI), X1
  1632  	PCMPEQB X0, X1
  1633  	PMOVMSKB X1, AX
  1634  	XORQ	$0xffff, AX
  1635  	JNE	diff48
  1636  
  1637  	MOVOU	48(SI), X0
  1638  	MOVOU	48(DI), X1
  1639  	PCMPEQB X0, X1
  1640  	PMOVMSKB X1, AX
  1641  	XORQ	$0xffff, AX
  1642  	JNE	diff64
  1643  
  1644  	ADDQ	$64, SI
  1645  	ADDQ	$64, DI
  1646  	SUBQ	$64, R8
  1647  	CMPQ	R8, $64
  1648  	JBE	loop
  1649  	JMP	big_loop
  1650  
  1651  	// Compare 64-bytes per loop iteration.
  1652  	// Loop is unrolled and uses AVX2.
  1653  big_loop_avx2:
  1654  	VMOVDQU	(SI), Y2
  1655  	VMOVDQU	(DI), Y3
  1656  	VMOVDQU	32(SI), Y4
  1657  	VMOVDQU	32(DI), Y5
  1658  	VPCMPEQB Y2, Y3, Y0
  1659  	VPMOVMSKB Y0, AX
  1660  	XORL	$0xffffffff, AX
  1661  	JNE	diff32_avx2
  1662  	VPCMPEQB Y4, Y5, Y6
  1663  	VPMOVMSKB Y6, AX
  1664  	XORL	$0xffffffff, AX
  1665  	JNE	diff64_avx2
  1666  
  1667  	ADDQ	$64, SI
  1668  	ADDQ	$64, DI
  1669  	SUBQ	$64, R8
  1670  	CMPQ	R8, $64
  1671  	JB	big_loop_avx2_exit
  1672  	JMP	big_loop_avx2
  1673  
  1674  	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
  1675  diff32_avx2:
  1676  	VZEROUPPER
  1677  	JMP diff16
  1678  
  1679  	// Same as diff32_avx2, but for last 32 bytes.
  1680  diff64_avx2:
  1681  	VZEROUPPER
  1682  	JMP diff48
  1683  
  1684  	// For <64 bytes remainder jump to normal loop.
  1685  big_loop_avx2_exit:
  1686  	VZEROUPPER
  1687  	JMP loop
  1688  
  1689  TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
  1690  	MOVQ s+0(FP), DI
  1691  	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
  1692  	MOVQ s_len+8(FP), DX
  1693  	MOVQ c+16(FP), BP
  1694  	MOVQ c_len+24(FP), AX
  1695  	MOVQ DI, R10
  1696  	LEAQ ret+32(FP), R11
  1697  	JMP  runtime·indexShortStr(SB)
  1698  
  1699  TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
  1700  	MOVQ s+0(FP), DI
  1701  	MOVQ s_len+8(FP), DX
  1702  	MOVQ c+24(FP), BP
  1703  	MOVQ c_len+32(FP), AX
  1704  	MOVQ DI, R10
  1705  	LEAQ ret+48(FP), R11
  1706  	JMP  runtime·indexShortStr(SB)
  1707  
   1708  // AX: length of the string we are searching for
   1709  // DX: length of the string in which we are searching
   1710  // DI: pointer to the string in which we are searching
   1711  // BP: pointer to the string we are searching for
   1712  // R11: address where to put the return value
  1713  TEXT runtime·indexShortStr(SB),NOSPLIT,$0
  1714  	CMPQ AX, DX
  1715  	JA fail
  1716  	CMPQ DX, $16
  1717  	JAE sse42
  1718  no_sse42:
  1719  	CMPQ AX, $2
  1720  	JA   _3_or_more
  1721  	MOVW (BP), BP
  1722  	LEAQ -1(DI)(DX*1), DX
  1723  loop2:
  1724  	MOVW (DI), SI
  1725  	CMPW SI,BP
  1726  	JZ success
  1727  	ADDQ $1,DI
  1728  	CMPQ DI,DX
  1729  	JB loop2
  1730  	JMP fail
  1731  _3_or_more:
  1732  	CMPQ AX, $3
  1733  	JA   _4_or_more
  1734  	MOVW 1(BP), BX
  1735  	MOVW (BP), BP
  1736  	LEAQ -2(DI)(DX*1), DX
  1737  loop3:
  1738  	MOVW (DI), SI
  1739  	CMPW SI,BP
  1740  	JZ   partial_success3
  1741  	ADDQ $1,DI
  1742  	CMPQ DI,DX
  1743  	JB loop3
  1744  	JMP fail
  1745  partial_success3:
  1746  	MOVW 1(DI), SI
  1747  	CMPW SI,BX
  1748  	JZ success
  1749  	ADDQ $1,DI
  1750  	CMPQ DI,DX
  1751  	JB loop3
  1752  	JMP fail
  1753  _4_or_more:
  1754  	CMPQ AX, $4
  1755  	JA   _5_or_more
  1756  	MOVL (BP), BP
  1757  	LEAQ -3(DI)(DX*1), DX
  1758  loop4:
  1759  	MOVL (DI), SI
  1760  	CMPL SI,BP
  1761  	JZ   success
  1762  	ADDQ $1,DI
  1763  	CMPQ DI,DX
  1764  	JB loop4
  1765  	JMP fail
  1766  _5_or_more:
  1767  	CMPQ AX, $7
  1768  	JA   _8_or_more
  1769  	LEAQ 1(DI)(DX*1), DX
  1770  	SUBQ AX, DX
  1771  	MOVL -4(BP)(AX*1), BX
  1772  	MOVL (BP), BP
  1773  loop5to7:
  1774  	MOVL (DI), SI
  1775  	CMPL SI,BP
  1776  	JZ   partial_success5to7
  1777  	ADDQ $1,DI
  1778  	CMPQ DI,DX
  1779  	JB loop5to7
  1780  	JMP fail
  1781  partial_success5to7:
  1782  	MOVL -4(AX)(DI*1), SI
  1783  	CMPL SI,BX
  1784  	JZ success
  1785  	ADDQ $1,DI
  1786  	CMPQ DI,DX
  1787  	JB loop5to7
  1788  	JMP fail
  1789  _8_or_more:
  1790  	CMPQ AX, $8
  1791  	JA   _9_or_more
  1792  	MOVQ (BP), BP
  1793  	LEAQ -7(DI)(DX*1), DX
  1794  loop8:
  1795  	MOVQ (DI), SI
  1796  	CMPQ SI,BP
  1797  	JZ   success
  1798  	ADDQ $1,DI
  1799  	CMPQ DI,DX
  1800  	JB loop8
  1801  	JMP fail
  1802  _9_or_more:
  1803  	CMPQ AX, $15
  1804  	JA   _16_or_more
  1805  	LEAQ 1(DI)(DX*1), DX
  1806  	SUBQ AX, DX
  1807  	MOVQ -8(BP)(AX*1), BX
  1808  	MOVQ (BP), BP
  1809  loop9to15:
  1810  	MOVQ (DI), SI
  1811  	CMPQ SI,BP
  1812  	JZ   partial_success9to15
  1813  	ADDQ $1,DI
  1814  	CMPQ DI,DX
  1815  	JB loop9to15
  1816  	JMP fail
  1817  partial_success9to15:
  1818  	MOVQ -8(AX)(DI*1), SI
  1819  	CMPQ SI,BX
  1820  	JZ success
  1821  	ADDQ $1,DI
  1822  	CMPQ DI,DX
  1823  	JB loop9to15
  1824  	JMP fail
  1825  _16_or_more:
  1826  	CMPQ AX, $16
  1827  	JA   _17_or_more
  1828  	MOVOU (BP), X1
  1829  	LEAQ -15(DI)(DX*1), DX
  1830  loop16:
  1831  	MOVOU (DI), X2
  1832  	PCMPEQB X1, X2
  1833  	PMOVMSKB X2, SI
  1834  	CMPQ  SI, $0xffff
  1835  	JE   success
  1836  	ADDQ $1,DI
  1837  	CMPQ DI,DX
  1838  	JB loop16
  1839  	JMP fail
  1840  _17_or_more:
  1841  	CMPQ AX, $31
  1842  	JA   _32_or_more
  1843  	LEAQ 1(DI)(DX*1), DX
  1844  	SUBQ AX, DX
  1845  	MOVOU -16(BP)(AX*1), X0
  1846  	MOVOU (BP), X1
  1847  loop17to31:
  1848  	MOVOU (DI), X2
  1849  	PCMPEQB X1,X2
  1850  	PMOVMSKB X2, SI
  1851  	CMPQ  SI, $0xffff
  1852  	JE   partial_success17to31
  1853  	ADDQ $1,DI
  1854  	CMPQ DI,DX
  1855  	JB loop17to31
  1856  	JMP fail
  1857  partial_success17to31:
  1858  	MOVOU -16(AX)(DI*1), X3
  1859  	PCMPEQB X0, X3
  1860  	PMOVMSKB X3, SI
  1861  	CMPQ  SI, $0xffff
  1862  	JE success
  1863  	ADDQ $1,DI
  1864  	CMPQ DI,DX
  1865  	JB loop17to31
  1866  	JMP fail
   1867  // We can get here only when AVX2 is enabled and the cutoff for indexShortStr is set to 63,
   1868  // so there is no need to check cpuid.
  1869  _32_or_more:
  1870  	CMPQ AX, $32
  1871  	JA   _33_to_63
  1872  	VMOVDQU (BP), Y1
  1873  	LEAQ -31(DI)(DX*1), DX
  1874  loop32:
  1875  	VMOVDQU (DI), Y2
  1876  	VPCMPEQB Y1, Y2, Y3
  1877  	VPMOVMSKB Y3, SI
  1878  	CMPL  SI, $0xffffffff
  1879  	JE   success_avx2
  1880  	ADDQ $1,DI
  1881  	CMPQ DI,DX
  1882  	JB loop32
  1883  	JMP fail_avx2
  1884  _33_to_63:
  1885  	LEAQ 1(DI)(DX*1), DX
  1886  	SUBQ AX, DX
  1887  	VMOVDQU -32(BP)(AX*1), Y0
  1888  	VMOVDQU (BP), Y1
  1889  loop33to63:
  1890  	VMOVDQU (DI), Y2
  1891  	VPCMPEQB Y1, Y2, Y3
  1892  	VPMOVMSKB Y3, SI
  1893  	CMPL  SI, $0xffffffff
  1894  	JE   partial_success33to63
  1895  	ADDQ $1,DI
  1896  	CMPQ DI,DX
  1897  	JB loop33to63
  1898  	JMP fail_avx2
  1899  partial_success33to63:
  1900  	VMOVDQU -32(AX)(DI*1), Y3
  1901  	VPCMPEQB Y0, Y3, Y4
  1902  	VPMOVMSKB Y4, SI
  1903  	CMPL  SI, $0xffffffff
  1904  	JE success_avx2
  1905  	ADDQ $1,DI
  1906  	CMPQ DI,DX
  1907  	JB loop33to63
  1908  fail_avx2:
  1909  	VZEROUPPER
  1910  fail:
  1911  	MOVQ $-1, (R11)
  1912  	RET
  1913  success_avx2:
  1914  	VZEROUPPER
  1915  	JMP success
  1916  sse42:
  1917  	CMPB runtime·support_sse42(SB), $1
  1918  	JNE no_sse42
  1919  	CMPQ AX, $12
  1920  	// PCMPESTRI is slower than a normal compare,
  1921  	// so using it pays off only if we advance 4+ bytes per compare.
  1922  	// This cutoff was determined experimentally and is roughly the same
  1923  	// on Nehalem (the first CPU with SSE4.2) and on Haswell.
  1924  	JAE _9_or_more
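        	// The MOVOU below reads a full 16 bytes of sep even though len(sep) < 12.
        	// If BP+16 lands within the first 16 bytes of a 4KB page (TESTW $0xff0),
        	// that read might touch an unmapped page, so fall back to no_sse42.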
  1925  	LEAQ 16(BP), SI
  1926  	TESTW $0xff0, SI
  1927  	JEQ no_sse42
  1928  	MOVOU (BP), X1
  1929  	LEAQ -15(DI)(DX*1), SI
  1930  	MOVQ $16, R9
  1931  	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
  1932  loop_sse42:
  1933  	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
  1934  	// for equality (bits 2,3 are 11)
  1935  	// result is not masked or inverted (bits 4,5 are 00)
  1936  	// and corresponds to first matching byte (bit 6 is 0)
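        	// That is, imm8 = 0x0c = 0b0001100: "equal ordered" (substring search)
        	// on unsigned bytes, returning the index of the first match.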
  1937  	PCMPESTRI $0x0c, (DI), X1
  1938  	// CX == 16 means no match,
  1939  	// CX > R9 means only a partial match at the end of this 16-byte window,
  1940  	// otherwise sep starts at offset CX within the 16 bytes loaded from DI.
  1941  	CMPQ CX, R9
  1942  	JBE sse42_success
  1943  	ADDQ R9, DI
  1944  	CMPQ DI, SI
  1945  	JB loop_sse42
  1946  	PCMPESTRI $0x0c, -1(SI), X1
  1947  	CMPQ CX, R9
  1948  	JA fail
  1949  	LEAQ -1(SI), DI
  1950  sse42_success:
  1951  	ADDQ CX, DI
  1952  success:
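        	// DI points at the match; convert it to an index by subtracting the
        	// start of the string (saved in R10 by the entry code) and store it
        	// at the result address in R11.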
  1953  	SUBQ R10, DI
  1954  	MOVQ DI, (R11)
  1955  	RET
  1956  
  1957  
  1958  TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
  1959  	MOVQ s+0(FP), SI
  1960  	MOVQ s_len+8(FP), BX
  1961  	MOVB c+24(FP), AL
  1962  	LEAQ ret+32(FP), R8
  1963  	JMP  runtime·indexbytebody(SB)
  1964  
  1965  TEXT strings·IndexByte(SB),NOSPLIT,$0-32
  1966  	MOVQ s+0(FP), SI
  1967  	MOVQ s_len+8(FP), BX
  1968  	MOVB c+16(FP), AL
  1969  	LEAQ ret+24(FP), R8
  1970  	JMP  runtime·indexbytebody(SB)
  1971  
  1972  // input:
  1973  //   SI: data
  1974  //   BX: data len
  1975  //   AL: byte sought
  1976  //   R8: address to put result
  1977  TEXT runtime·indexbytebody(SB),NOSPLIT,$0
  1978  	// Shuffle X0 around so that each byte contains
  1979  	// the character we're looking for.
  1980  	MOVD AX, X0
  1981  	PUNPCKLBW X0, X0
  1982  	PUNPCKLBW X0, X0
  1983  	PSHUFL $0, X0, X0
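        	// Each PUNPCKLBW doubles the run of copies (1 -> 2 -> 4 bytes) and the
        	// PSHUFL then replicates that 4-byte group, leaving 16 copies of AL in X0.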
  1984  	
  1985  	CMPQ BX, $16
  1986  	JLT small
  1987  
  1988  	MOVQ SI, DI
  1989  
  1990  	CMPQ BX, $32
  1991  	JA avx2
  1992  sse:
  1993  	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
  1994  	JMP	sseloopentry
  1995  	
  1996  sseloop:
  1997  	// Move the next 16-byte chunk of the data into X1.
  1998  	MOVOU	(DI), X1
  1999  	// Compare bytes in X0 to X1.
  2000  	PCMPEQB	X0, X1
  2001  	// Take the top bit of each byte in X1 and put the result in DX.
  2002  	PMOVMSKB X1, DX
  2003  	// Find first set bit, if any.
  2004  	BSFL	DX, DX
  2005  	JNZ	ssesuccess
  2006  	// Advance to next block.
  2007  	ADDQ	$16, DI
  2008  sseloopentry:
  2009  	CMPQ	DI, AX
  2010  	JB	sseloop
  2011  
  2012  	// Search the last 16-byte chunk. This chunk may overlap with the
  2013  	// chunks we've already searched, but that's ok.
  2014  	MOVQ	AX, DI
  2015  	MOVOU	(AX), X1
  2016  	PCMPEQB	X0, X1
  2017  	PMOVMSKB X1, DX
  2018  	BSFL	DX, DX
  2019  	JNZ	ssesuccess
  2020  
  2021  failure:
  2022  	MOVQ $-1, (R8)
  2023  	RET
  2024  
  2025  // We've found a chunk containing the byte.
  2026  // The chunk was loaded from DI.
  2027  // The index of the matching byte in the chunk is DX.
  2028  // The start of the data is SI.
  2029  ssesuccess:
  2030  	SUBQ SI, DI	// Compute offset of chunk within data.
  2031  	ADDQ DX, DI	// Add offset of byte within chunk.
  2032  	MOVQ DI, (R8)
  2033  	RET
  2034  
  2035  // Handle lengths < 16.
  2036  small:
  2037  	TESTQ	BX, BX
  2038  	JEQ	failure
  2039  
  2040  	// Check if we'll load across a page boundary.
  2041  	LEAQ	16(SI), AX
  2042  	TESTW	$0xff0, AX
  2043  	JEQ	endofpage
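        	// (TESTW $0xff0 sets ZF when SI+16 is within the first 16 bytes of a 4KB
        	// page, i.e. when a 16-byte load at SI might run past the end of the
        	// current page; in that case endofpage below loads the 16 bytes ending
        	// at SI+BX instead.)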
  2044  
  2045  	MOVOU	(SI), X1 // Load data
  2046  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  2047  	PMOVMSKB X1, DX	// Move result bits to integer register.
  2048  	BSFL	DX, DX	// Find first set bit.
  2049  	JZ	failure	// No set bit, failure.
  2050  	CMPL	DX, BX
  2051  	JAE	failure	// Match is past end of data.
  2052  	MOVQ	DX, (R8)
  2053  	RET
  2054  
  2055  endofpage:
  2056  	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
  2057  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  2058  	PMOVMSKB X1, DX	// Move result bits to integer register.
  2059  	MOVL	BX, CX
  2060  	SHLL	CX, DX
  2061  	SHRL	$16, DX	// Shift desired bits down to bottom of register.
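        	// Example: BX = 3 loads the 16 bytes ending at SI+3, so the three bytes of
        	// interest produce mask bits 13-15; SHLL 3 then SHRL 16 moves them to bits 0-2.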
  2062  	BSFL	DX, DX	// Find first set bit.
  2063  	JZ	failure	// No set bit, failure.
  2064  	MOVQ	DX, (R8)
  2065  	RET
  2066  
  2067  avx2:
  2068  	CMPB   runtime·support_avx2(SB), $1
  2069  	JNE sse
  2070  	MOVD AX, X0
  2071  	LEAQ -32(SI)(BX*1), R11
  2072  	VPBROADCASTB  X0, Y1
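        	// Y1 now holds 32 copies of the byte we are searching for.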
  2073  avx2_loop:
  2074  	VMOVDQU (DI), Y2
  2075  	VPCMPEQB Y1, Y2, Y3
  2076  	VPTEST Y3, Y3
  2077  	JNZ avx2success
  2078  	ADDQ $32, DI
  2079  	CMPQ DI, R11
  2080  	JLT avx2_loop
  2081  	MOVQ R11, DI
  2082  	VMOVDQU (DI), Y2
  2083  	VPCMPEQB Y1, Y2, Y3
  2084  	VPTEST Y3, Y3
  2085  	JNZ avx2success
  2086  	VZEROUPPER
  2087  	MOVQ $-1, (R8)
  2088  	RET
  2089  
  2090  avx2success:
  2091  	VPMOVMSKB Y3, DX
  2092  	BSFL DX, DX
  2093  	SUBQ SI, DI
  2094  	ADDQ DI, DX
  2095  	MOVQ DX, (R8)
  2096  	VZEROUPPER
  2097  	RET
  2098  
  2099  TEXT bytes·Equal(SB),NOSPLIT,$0-49
  2100  	MOVQ	a_len+8(FP), BX
  2101  	MOVQ	b_len+32(FP), CX
  2102  	CMPQ	BX, CX
  2103  	JNE	eqret
  2104  	MOVQ	a+0(FP), SI
  2105  	MOVQ	b+24(FP), DI
  2106  	LEAQ	ret+48(FP), AX
  2107  	JMP	runtime·memeqbody(SB)
  2108  eqret:
  2109  	MOVB	$0, ret+48(FP)
  2110  	RET
  2111  
  2112  
  2113  TEXT bytes·countByte(SB),NOSPLIT,$0-40
  2114  	MOVQ s+0(FP), SI
  2115  	MOVQ s_len+8(FP), BX
  2116  	MOVB c+24(FP), AL
  2117  	LEAQ ret+32(FP), R8
  2118  	JMP  runtime·countByte(SB)
  2119  
  2120  TEXT strings·countByte(SB),NOSPLIT,$0-32
  2121  	MOVQ s+0(FP), SI
  2122  	MOVQ s_len+8(FP), BX
  2123  	MOVB c+16(FP), AL
  2124  	LEAQ ret+24(FP), R8
  2125  	JMP  runtime·countByte(SB)
  2126  
  2127  // input:
  2128  //   SI: data
  2129  //   BX: data len
  2130  //   AL: byte sought
  2131  //   R8: address to put result
  2132  // This requires the POPCNT instruction
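        // The routine computes the same result as this scalar Go loop (shown only
        // for illustration; s is the data, c the byte), but 16 bytes (SSE) or
        // 32 bytes (AVX2) at a time using PCMPEQB/PMOVMSKB (or VPCMPEQB/VPMOVMSKB)
        // followed by POPCNT:
        //
        //	n := 0
        //	for _, b := range s {
        //		if b == c {
        //			n++
        //		}
        //	}
        //	return n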
  2133  TEXT runtime·countByte(SB),NOSPLIT,$0
  2134  	// Shuffle X0 around so that each byte contains
  2135  	// the character we're looking for.
  2136  	MOVD AX, X0
  2137  	PUNPCKLBW X0, X0
  2138  	PUNPCKLBW X0, X0
  2139  	PSHUFL $0, X0, X0
  2140  
  2141  	CMPQ BX, $16
  2142  	JLT small
  2143  
  2144  	MOVQ $0, R12 // Accumulator
  2145  
  2146  	MOVQ SI, DI
  2147  
  2148  	CMPQ BX, $32
  2149  	JA avx2
  2150  sse:
  2151  	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
  2152  	JMP	sseloopentry
  2153  
  2154  sseloop:
  2155  	// Move the next 16-byte chunk of the data into X1.
  2156  	MOVOU	(DI), X1
  2157  	// Compare bytes in X0 to X1.
  2158  	PCMPEQB	X0, X1
  2159  	// Take the top bit of each byte in X1 and put the result in DX.
  2160  	PMOVMSKB X1, DX
  2161  	// Count number of matching bytes
  2162  	POPCNTL DX, DX
  2163  	// Accumulate into R12
  2164  	ADDQ DX, R12
  2165  	// Advance to next block.
  2166  	ADDQ	$16, DI
  2167  sseloopentry:
  2168  	CMPQ	DI, AX
  2169  	JBE	sseloop
  2170  
  2171  	// Get the number of bytes to consider in the last 16 bytes
  2172  	ANDQ $15, BX
  2173  	JZ end
  2174  
  2175  	// Create mask to ignore overlap between previous 16 byte block
  2176  	// and the next.
  2177  	MOVQ $16,CX
  2178  	SUBQ BX, CX
  2179  	MOVQ $0xFFFF, R10
  2180  	SARQ CL, R10
  2181  	SALQ CL, R10
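        	// Example: BX = 3 remaining bytes gives CX = 13, so R10 = 0xFFFF>>13<<13 = 0xE000,
        	// keeping only the mask bits of the last 3 bytes of the final chunk.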
  2182  
  2183  	// Process the last 16-byte chunk. This chunk may overlap with the
  2184  	// chunks we've already searched so we need to mask part of it.
  2185  	MOVOU	(AX), X1
  2186  	PCMPEQB	X0, X1
  2187  	PMOVMSKB X1, DX
  2188  	// Apply mask
  2189  	ANDQ R10, DX
  2190  	POPCNTL DX, DX
  2191  	ADDQ DX, R12
  2192  end:
  2193  	MOVQ R12, (R8)
  2194  	RET
  2195  
  2196  // Handle lengths < 16.
  2197  small:
  2198  	TESTQ	BX, BX
  2199  	JEQ	endzero
  2200  
  2201  	// Check if we'll load across a page boundary.
  2202  	LEAQ	16(SI), AX
  2203  	TESTW	$0xff0, AX
  2204  	JEQ	endofpage
  2205  
  2206  	// We must ignore high bytes as they aren't part of our slice.
  2207  	// Create mask.
  2208  	MOVB BX, CX
  2209  	MOVQ $1, R10
  2210  	SALQ CL, R10
  2211  	SUBQ $1, R10
  2212  
  2213  	// Load data
  2214  	MOVOU	(SI), X1
  2215  	// Compare target byte with each byte in data.
  2216  	PCMPEQB	X0, X1
  2217  	// Move result bits to integer register.
  2218  	PMOVMSKB X1, DX
  2219  	// Apply mask
  2220  	ANDQ R10, DX
  2221  	POPCNTL DX, DX
  2222  	// Directly return DX, we don't need to accumulate
  2223  	// since we have <16 bytes.
  2224  	MOVQ	DX, (R8)
  2225  	RET
  2226  endzero:
  2227  	MOVQ $0, (R8)
  2228  	RET
  2229  
  2230  endofpage:
  2231  	// We must ignore low bytes as they aren't part of our slice.
  2232  	MOVQ $16,CX
  2233  	SUBQ BX, CX
  2234  	MOVQ $0xFFFF, R10
  2235  	SARQ CL, R10
  2236  	SALQ CL, R10
  2237  
  2238  	// Load data into the high end of X1.
  2239  	MOVOU	-16(SI)(BX*1), X1
  2240  	// Compare target byte with each byte in data.
  2241  	PCMPEQB	X0, X1
  2242  	// Move result bits to integer register.
  2243  	PMOVMSKB X1, DX
  2244  	// Apply mask
  2245  	ANDQ R10, DX
  2246  	// Directly return DX, we don't need to accumulate
  2247  	// since we have <16 bytes.
  2248  	POPCNTL DX, DX
  2249  	MOVQ	DX, (R8)
  2250  	RET
  2251  
  2252  avx2:
  2253  	CMPB   runtime·support_avx2(SB), $1
  2254  	JNE sse
  2255  	MOVD AX, X0
  2256  	LEAQ -32(SI)(BX*1), R11
  2257  	VPBROADCASTB  X0, Y1
  2258  avx2_loop:
  2259  	VMOVDQU (DI), Y2
  2260  	VPCMPEQB Y1, Y2, Y3
  2261  	VPMOVMSKB Y3, DX
  2262  	POPCNTL DX, DX
  2263  	ADDQ DX, R12
  2264  	ADDQ $32, DI
  2265  	CMPQ DI, R11
  2266  	JLE avx2_loop
  2267  
  2268  	// If the last block has already been processed,
  2269  	// skip to the end.
  2270  	CMPQ DI, R11
  2271  	JEQ endavx
  2272  
  2273  	// Load address of the last 32 bytes.
  2274  	// There is an overlap with the previous block.
  2275  	MOVQ R11, DI
  2276  	VMOVDQU (DI), Y2
  2277  	VPCMPEQB Y1, Y2, Y3
  2278  	VPMOVMSKB Y3, DX
  2279  	// Exit AVX mode.
  2280  	VZEROUPPER
  2281  
  2282  	// Create mask to ignore overlap between previous 32 byte block
  2283  	// and the next.
  2284  	ANDQ $31, BX
  2285  	MOVQ $32,CX
  2286  	SUBQ BX, CX
  2287  	MOVQ $0xFFFFFFFF, R10
  2288  	SARQ CL, R10
  2289  	SALQ CL, R10
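        	// Same trick as the SSE tail above, now with a 32-bit mask: e.g. BX&31 = 5
        	// gives CX = 27 and R10 = 0xF8000000, the bits of the last 5 bytes.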
  2290  	// Apply mask
  2291  	ANDQ R10, DX
  2292  	POPCNTL DX, DX
  2293  	ADDQ DX, R12
  2294  	MOVQ R12, (R8)
  2295  	RET
  2296  endavx:
  2297  	// Exit AVX mode.
  2298  	VZEROUPPER
  2299  	MOVQ R12, (R8)
  2300  	RET
  2301  
  2302  TEXT runtime·return0(SB), NOSPLIT, $0
  2303  	MOVL	$0, AX
  2304  	RET
  2305  
  2306  
  2307  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  2308  // Must obey the gcc calling convention.
  2309  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  2310  	get_tls(CX)
  2311  	MOVQ	g(CX), AX
  2312  	MOVQ	g_m(AX), AX
  2313  	MOVQ	m_curg(AX), AX
  2314  	MOVQ	(g_stack+stack_hi)(AX), AX
  2315  	RET
  2316  
  2317  // The top-most function running on a goroutine
  2318  // returns to goexit+PCQuantum.
  2319  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  2320  	BYTE	$0x90	// NOP
  2321  	CALL	runtime·goexit1(SB)	// does not return
  2322  	// traceback from goexit1 must hit code range of goexit
  2323  	BYTE	$0x90	// NOP
  2324  
  2325  // This is called from .init_array and follows the platform, not Go, ABI.
  2326  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  2327  	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
  2328  	MOVQ	runtime·lastmoduledatap(SB), AX
  2329  	MOVQ	DI, moduledata_next(AX)
  2330  	MOVQ	DI, runtime·lastmoduledatap(SB)
  2331  	POPQ	R15
  2332  	RET