github.com/fenixara/go@v0.0.0-20170127160404-96ea0918e670/src/runtime/asm_amd64.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0	// process bootstrap: set up g0/m0, probe CPU features, then start the scheduler
    11  	// copy arguments forward on an even stack
    12  	MOVQ	DI, AX		// argc
    13  	MOVQ	SI, BX		// argv
    14  	SUBQ	$(4*8+7), SP		// 2args 2auto
    15  	ANDQ	$~15, SP		// round SP down to a 16-byte boundary
    16  	MOVQ	AX, 16(SP)
    17  	MOVQ	BX, 24(SP)
    18  	
    19  	// create istack out of the given (operating system) stack.
    20  	// _cgo_init may update stackguard.
    21  	MOVQ	$runtime·g0(SB), DI
    22  	LEAQ	(-64*1024+104)(SP), BX	// provisional lower bound: 64KB below SP, plus 104 bytes
    23  	MOVQ	BX, g_stackguard0(DI)
    24  	MOVQ	BX, g_stackguard1(DI)
    25  	MOVQ	BX, (g_stack+stack_lo)(DI)
    26  	MOVQ	SP, (g_stack+stack_hi)(DI)
    27  
    28  	// find out information about the processor we're on
    29  	MOVQ	$0, AX
    30  	CPUID
    31  	MOVQ	AX, SI		// SI = AX from CPUID leaf 0; compared against 7 before leaf 7 is queried
    32  	CMPQ	AX, $0
    33  	JE	nocpuinfo
    34  
    35  	// Figure out how to serialize RDTSC.
    36  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    37  	// Don't know about the rest, so let's do MFENCE.
    38  	CMPL	BX, $0x756E6547  // "Genu"
    39  	JNE	notintel
    40  	CMPL	DX, $0x49656E69  // "ineI"
    41  	JNE	notintel
    42  	CMPL	CX, $0x6C65746E  // "ntel"
    43  	JNE	notintel
    44  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
    45  notintel:
    46  
    47  	// Load EAX=1 cpuid flags
    48  	MOVQ	$1, AX
    49  	CPUID
    50  	MOVL	CX, runtime·cpuid_ecx(SB)
    51  	MOVL	DX, runtime·cpuid_edx(SB)
    52  
    53  	// Load EAX=7/ECX=0 cpuid flags
    54  	CMPQ	SI, $7
    55  	JLT	no7		// leaf 7 not available: cpuid_ebx7 is left untouched
    56  	MOVL	$7, AX
    57  	MOVL	$0, CX
    58  	CPUID
    59  	MOVL	BX, runtime·cpuid_ebx7(SB)
    60  no7:
    61  	// Detect AVX and AVX2 as per 14.7.1  Detection of AVX2 chapter of [1]
    62  	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
    63  	// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
    64  	MOVL	runtime·cpuid_ecx(SB), CX
    65  	ANDL    $0x18000000, CX // check for OSXSAVE and AVX bits
    66  	CMPL    CX, $0x18000000
    67  	JNE     noavx
    68  	MOVL    $0, CX
    69  	// For XGETBV, OSXSAVE bit is required and sufficient
    70  	XGETBV
    71  	ANDL    $6, AX
    72  	CMPL    AX, $6 // Check for OS support of YMM registers
    73  	JNE     noavx
    74  	MOVB    $1, runtime·support_avx(SB)
    75  	TESTL   $(1<<5), runtime·cpuid_ebx7(SB) // check for AVX2 bit
    76  	JEQ     noavx2
    77  	MOVB    $1, runtime·support_avx2(SB)
    78  	JMP     testbmi1
    79  noavx:
    80  	MOVB    $0, runtime·support_avx(SB)
    81  noavx2:
    82  	MOVB    $0, runtime·support_avx2(SB)
    83  testbmi1:
    84  	// Detect BMI1 and BMI2 extensions as per
    85  	// 5.1.16.1 Detection of VEX-encoded GPR Instructions,
    86  	//   LZCNT and TZCNT, PREFETCHW chapter of [1]
    87  	MOVB    $0, runtime·support_bmi1(SB)
    88  	TESTL   $(1<<3), runtime·cpuid_ebx7(SB) // check for BMI1 bit
    89  	JEQ     testbmi2
    90  	MOVB    $1, runtime·support_bmi1(SB)
    91  testbmi2:
    92  	MOVB    $0, runtime·support_bmi2(SB)
    93  	TESTL   $(1<<8), runtime·cpuid_ebx7(SB) // check for BMI2 bit
    94  	JEQ     nocpuinfo
    95  	MOVB    $1, runtime·support_bmi2(SB)
    96  nocpuinfo:	
    97  	
    98  	// if there is an _cgo_init, call it.
    99  	MOVQ	_cgo_init(SB), AX
   100  	TESTQ	AX, AX
   101  	JZ	needtls
   102  	// g0 already in DI
   103  	MOVQ	DI, CX	// Win64 uses CX for first parameter
   104  	MOVQ	$setg_gcc<>(SB), SI	// second argument: address of setg_gcc
   105  	CALL	AX
   106  
   107  	// update stackguard after _cgo_init
   108  	MOVQ	$runtime·g0(SB), CX
   109  	MOVQ	(g_stack+stack_lo)(CX), AX
   110  	ADDQ	$const__StackGuard, AX
   111  	MOVQ	AX, g_stackguard0(CX)
   112  	MOVQ	AX, g_stackguard1(CX)
   113  
   114  #ifndef GOOS_windows
   115  	JMP ok
   116  #endif
   117  needtls:
   118  #ifdef GOOS_plan9
   119  	// skip TLS setup on Plan 9
   120  	JMP ok
   121  #endif
   122  #ifdef GOOS_solaris
   123  	// skip TLS setup on Solaris
   124  	JMP ok
   125  #endif
   126  
   127  	LEAQ	runtime·m0+m_tls(SB), DI
   128  	CALL	runtime·settls(SB)
   129  
   130  	// store through it, to make sure it works
   131  	get_tls(BX)
   132  	MOVQ	$0x123, g(BX)
   133  	MOVQ	runtime·m0+m_tls(SB), AX
   134  	CMPQ	AX, $0x123
   135  	JEQ 2(PC)		// value read back matches: skip the abort below
   136  	MOVL	AX, 0	// abort
   137  ok:
   138  	// set the per-goroutine and per-mach "registers"
   139  	get_tls(BX)
   140  	LEAQ	runtime·g0(SB), CX
   141  	MOVQ	CX, g(BX)
   142  	LEAQ	runtime·m0(SB), AX
   143  
   144  	// save m->g0 = g0
   145  	MOVQ	CX, m_g0(AX)
   146  	// save m0 to g0->m
   147  	MOVQ	AX, g_m(CX)
   148  
   149  	CLD				// convention is D is always left cleared
   150  	CALL	runtime·check(SB)
   151  
   152  	MOVL	16(SP), AX		// copy argc
   153  	MOVL	AX, 0(SP)
   154  	MOVQ	24(SP), AX		// copy argv
   155  	MOVQ	AX, 8(SP)
   156  	CALL	runtime·args(SB)
   157  	CALL	runtime·osinit(SB)
   158  	CALL	runtime·schedinit(SB)
   159  
   160  	// create a new goroutine to start program
   161  	MOVQ	$runtime·mainPC(SB), AX		// entry
   162  	PUSHQ	AX
   163  	PUSHQ	$0			// arg size
   164  	CALL	runtime·newproc(SB)
   165  	POPQ	AX			// discard the two words pushed for newproc
   166  	POPQ	AX
   167  
   168  	// start this M
   169  	CALL	runtime·mstart(SB)
   170  
   171  	MOVL	$0xf1, 0xf1  // crash: only reached if mstart returns
   172  	RET
   173  
   174  DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)	// 8-byte word holding the address of runtime·main; rt0_go passes &mainPC to newproc as the entry
   175  GLOBL	runtime·mainPC(SB),RODATA,$8
   176  
   177  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
   178  	BYTE	$0xcc		// emit the single-byte 0xcc opcode (x86 INT3 breakpoint trap)
   179  	RET
   180  
   181  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   182  	// No per-thread init: this function is an empty stub that returns immediately.
   183  	RET
   184  
   185  /*
   186   *  go-routine
   187   */
   188  
   189  // void gosave(Gobuf*)
   190  // save state in Gobuf; setjmp. Records the caller's SP, PC and BP and the current g; zeroes ret.
   191  TEXT runtime·gosave(SB), NOSPLIT, $0-8
   192  	MOVQ	buf+0(FP), AX		// gobuf
   193  	LEAQ	buf+0(FP), BX		// caller's SP
   194  	MOVQ	BX, gobuf_sp(AX)
   195  	MOVQ	0(SP), BX		// caller's PC
   196  	MOVQ	BX, gobuf_pc(AX)
   197  	MOVQ	$0, gobuf_ret(AX)
   198  	MOVQ	BP, gobuf_bp(AX)
   199  	// Assert ctxt is zero. See func save.
   200  	MOVQ	gobuf_ctxt(AX), BX
   201  	TESTQ	BX, BX
   202  	JZ	2(PC)			// ctxt == 0: skip the badctxt call
   203  	CALL	runtime·badctxt(SB)
   204  	get_tls(CX)
   205  	MOVQ	g(CX), BX		// BX = current g
   206  	MOVQ	BX, gobuf_g(AX)
   207  	RET
   208  
   209  // void gogo(Gobuf*)
   210  // restore state from Gobuf; longjmp. Installs buf.g as the current g, loads SP/BP/ret/ctxt, and jumps to buf.pc.
   211  TEXT runtime·gogo(SB), NOSPLIT, $16-8
   212  	MOVQ	buf+0(FP), BX		// gobuf
   213  
   214  	// If ctxt is not nil, invoke deletion barrier before overwriting.
   215  	MOVQ	gobuf_ctxt(BX), AX
   216  	TESTQ	AX, AX
   217  	JZ	nilctxt
   218  	LEAQ	gobuf_ctxt(BX), AX
   219  	MOVQ	AX, 0(SP)		// first argument: address of buf.ctxt
   220  	MOVQ	$0, 8(SP)		// second argument: 0, the value stored into ctxt below
   221  	CALL	runtime·writebarrierptr_prewrite(SB)
   222  	MOVQ	buf+0(FP), BX		// reload the gobuf pointer after the call
   223  
   224  nilctxt:
   225  	MOVQ	gobuf_g(BX), DX
   226  	MOVQ	0(DX), CX		// make sure g != nil
   227  	get_tls(CX)
   228  	MOVQ	DX, g(CX)		// current g = buf.g
   229  	MOVQ	gobuf_sp(BX), SP	// restore SP
   230  	MOVQ	gobuf_ret(BX), AX
   231  	MOVQ	gobuf_ctxt(BX), DX
   232  	MOVQ	gobuf_bp(BX), BP
   233  	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   234  	MOVQ	$0, gobuf_ret(BX)
   235  	MOVQ	$0, gobuf_ctxt(BX)
   236  	MOVQ	$0, gobuf_bp(BX)
   237  	MOVQ	gobuf_pc(BX), BX	// BX = saved PC; the gobuf pointer is no longer needed
   238  	JMP	BX
   239  
   240  // func mcall(fn func(*g))
   241  // Switch to m->g0's stack, call fn(g).
   242  // Fn must never return. It should gogo(&g->sched)
   243  // to keep running g. Calling mcall while already on m->g0 jumps to badmcall.
   244  TEXT runtime·mcall(SB), NOSPLIT, $0-8
   245  	MOVQ	fn+0(FP), DI
   246  	
   247  	get_tls(CX)
   248  	MOVQ	g(CX), AX	// save state in g->sched
   249  	MOVQ	0(SP), BX	// caller's PC
   250  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   251  	LEAQ	fn+0(FP), BX	// caller's SP
   252  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   253  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   254  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   255  
   256  	// switch to m->g0 & its stack, call fn
   257  	MOVQ	g(CX), BX
   258  	MOVQ	g_m(BX), BX
   259  	MOVQ	m_g0(BX), SI
   260  	CMPQ	SI, AX	// if g == m->g0 call badmcall
   261  	JNE	3(PC)	// g != m->g0: skip the two-instruction badmcall jump
   262  	MOVQ	$runtime·badmcall(SB), AX
   263  	JMP	AX
   264  	MOVQ	SI, g(CX)	// g = m->g0
   265  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   266  	PUSHQ	AX	// argument for fn: the g that was just switched away from
   267  	MOVQ	DI, DX	// DX = fn (the func value itself)
   268  	MOVQ	0(DI), DI	// DI = code pointer held in the first word of the func value
   269  	CALL	DI
   270  	POPQ	AX
   271  	MOVQ	$runtime·badmcall2(SB), AX	// only reached if fn returned
   272  	JMP	AX
   273  	RET
   274  
   275  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   276  // of the G stack. We need to distinguish the routine that
   277  // lives at the bottom of the G stack from the one that lives
   278  // at the top of the system stack because the one at the top of
   279  // the system stack terminates the stack walk (see topofstack()).
   280  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   281  	RET	// body is empty; systemstack stores this function's address as the saved PC in g->sched
   282  
   283  // func systemstack(fn func())
   284  TEXT runtime·systemstack(SB), NOSPLIT, $0-8	// call fn on the g0 stack; calls it directly when already on g0 or gsignal
   285  	MOVQ	fn+0(FP), DI	// DI = fn
   286  	get_tls(CX)
   287  	MOVQ	g(CX), AX	// AX = g
   288  	MOVQ	g_m(AX), BX	// BX = m
   289  
   290  	MOVQ	m_gsignal(BX), DX	// DX = gsignal
   291  	CMPQ	AX, DX
   292  	JEQ	noswitch
   293  
   294  	MOVQ	m_g0(BX), DX	// DX = g0
   295  	CMPQ	AX, DX
   296  	JEQ	noswitch
   297  
   298  	MOVQ	m_curg(BX), R8
   299  	CMPQ	AX, R8
   300  	JEQ	switch
   301  	
   302  	// Bad: g is not gsignal, not g0, not curg. What is it?
   303  	MOVQ	$runtime·badsystemstack(SB), AX
   304  	CALL	AX	// NOTE(review): execution falls through to switch if this returns; presumably badsystemstack never does
   305  
   306  switch:
   307  	// save our state in g->sched. Pretend to
   308  	// be systemstack_switch if the G stack is scanned.
   309  	MOVQ	$runtime·systemstack_switch(SB), SI
   310  	MOVQ	SI, (g_sched+gobuf_pc)(AX)
   311  	MOVQ	SP, (g_sched+gobuf_sp)(AX)
   312  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   313  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   314  
   315  	// switch to g0
   316  	MOVQ	DX, g(CX)
   317  	MOVQ	(g_sched+gobuf_sp)(DX), BX
   318  	// make it look like mstart called systemstack on g0, to stop traceback
   319  	SUBQ	$8, BX
   320  	MOVQ	$runtime·mstart(SB), DX
   321  	MOVQ	DX, 0(BX)	// fake return address slot holding the address of mstart
   322  	MOVQ	BX, SP
   323  
   324  	// call target function
   325  	MOVQ	DI, DX	// DX = fn (the func value itself)
   326  	MOVQ	0(DI), DI	// DI = code pointer held in the first word of the func value
   327  	CALL	DI
   328  
   329  	// switch back to g
   330  	get_tls(CX)
   331  	MOVQ	g(CX), AX
   332  	MOVQ	g_m(AX), BX
   333  	MOVQ	m_curg(BX), AX
   334  	MOVQ	AX, g(CX)
   335  	MOVQ	(g_sched+gobuf_sp)(AX), SP	// back on the SP saved at switch
   336  	MOVQ	$0, (g_sched+gobuf_sp)(AX)
   337  	RET
   338  
   339  noswitch:
   340  	// already on m stack, just call directly
   341  	MOVQ	DI, DX
   342  	MOVQ	0(DI), DI
   343  	CALL	DI
   344  	RET
   345  
   346  /*
   347   * support for morestack
   348   */
   349  
   350  // Called during function prolog when more stack is needed.
   351  //
   352  // The traceback routines see morestack on a g0 as being
   353  // the top of a stack (for example, morestack calling newstack
   354  // calling the scheduler calling newm calling gc), so we must
   355  // record an argument size. For that purpose, it has no arguments.
   356  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   357  	// Cannot grow scheduler stack (m->g0).
   358  	get_tls(CX)
   359  	MOVQ	g(CX), BX
   360  	MOVQ	g_m(BX), BX
   361  	MOVQ	m_g0(BX), SI
   362  	CMPQ	g(CX), SI
   363  	JNE	3(PC)	// not on g0: skip the fatal call and trap
   364  	CALL	runtime·badmorestackg0(SB)
   365  	INT	$3
   366  
   367  	// Cannot grow signal stack (m->gsignal).
   368  	MOVQ	m_gsignal(BX), SI
   369  	CMPQ	g(CX), SI
   370  	JNE	3(PC)	// not on gsignal: skip the fatal call and trap
   371  	CALL	runtime·badmorestackgsignal(SB)
   372  	INT	$3
   373  
   374  	// Called from f.
   375  	// Set m->morebuf to f's caller.
   376  	MOVQ	8(SP), AX	// f's caller's PC
   377  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   378  	LEAQ	16(SP), AX	// f's caller's SP
   379  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   380  	get_tls(CX)
   381  	MOVQ	g(CX), SI
   382  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   383  
   384  	// Set g->sched to context in f.
   385  	MOVQ	0(SP), AX // f's PC
   386  	MOVQ	AX, (g_sched+gobuf_pc)(SI)
   387  	MOVQ	SI, (g_sched+gobuf_g)(SI)
   388  	LEAQ	8(SP), AX // f's SP
   389  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   390  	MOVQ	BP, (g_sched+gobuf_bp)(SI)
   391  	// newstack will fill gobuf.ctxt.
   392  
   393  	// Call newstack on m->g0's stack.
   394  	MOVQ	m_g0(BX), BX
   395  	MOVQ	BX, g(CX)	// current g = m->g0
   396  	MOVQ	(g_sched+gobuf_sp)(BX), SP
   397  	PUSHQ	DX	// ctxt argument
   398  	CALL	runtime·newstack(SB)
   399  	MOVQ	$0, 0x1003	// crash if newstack returns
   400  	POPQ	DX	// keep balance check happy
   401  	RET
   402  
   403  // morestack but not preserving ctxt: zeroes DX, which morestack pushes as newstack's ctxt argument.
   404  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   405  	MOVL	$0, DX
   406  	JMP	runtime·morestack(SB)	// tail-jump, so morestack sees the same stack layout
   407  
   408  TEXT runtime·stackBarrier(SB),NOSPLIT,$0
   409  	// We came here via a RET to an overwritten return PC.
   410  	// AX may be live. Other registers are available.
   411  
   412  	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   413  	get_tls(CX)
   414  	MOVQ	g(CX), CX
   415  	MOVQ	(g_stkbar+slice_array)(CX), DX
   416  	MOVQ	g_stkbarPos(CX), BX
   417  	IMULQ	$stkbar__size, BX	// Too big for SIB.
   418  	MOVQ	stkbar_savedLRPtr(DX)(BX*1), R8
   419  	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
   420  	// Assert that we're popping the right saved LR.
   421  	ADDQ	$8, R8		// savedLRPtr + 8 must equal SP now that the return PC has been popped
   422  	CMPQ	R8, SP
   423  	JEQ	2(PC)		// match: skip the deliberate crash
   424  	MOVL	$0, 0		// mismatch: crash by writing to address 0
   425  	// Record that this stack barrier was hit.
   426  	ADDQ	$1, g_stkbarPos(CX)
   427  	// Jump to the original return PC.
   428  	JMP	BX
   429  
   430  // reflectcall: call a function with the given argument list
   431  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   432  // we don't have variable-sized frames, so we use a small number
   433  // of constant-sized-frame functions to encode a few bits of size in the pc.
   434  // Caution: ugly multiline assembly macros in your future!
   435  
   436  #define DISPATCH(NAME,MAXSIZE)		\
   437  	CMPQ	CX, $MAXSIZE;		\
   438  	JA	3(PC);			\
   439  	MOVQ	$NAME(SB), AX;		\
   440  	JMP	AX
   441  // DISPATCH jumps to NAME when CX <= MAXSIZE (unsigned) and otherwise falls through. Note: can't just "JMP NAME(SB)" - bad inlining results.
   442  
   443  TEXT reflect·call(SB), NOSPLIT, $0-0
   444  	JMP	·reflectcall(SB)	// tail-jump: no frame of its own, so reflectcall sees the caller's arguments unchanged
   445  
   446  TEXT ·reflectcall(SB), NOSPLIT, $0-32
   447  	MOVLQZX argsize+24(FP), CX	// CX = argsize, zero-extended from 32 bits; tested by each DISPATCH below
   448  	DISPATCH(runtime·call32, 32)
   449  	DISPATCH(runtime·call64, 64)
   450  	DISPATCH(runtime·call128, 128)
   451  	DISPATCH(runtime·call256, 256)
   452  	DISPATCH(runtime·call512, 512)
   453  	DISPATCH(runtime·call1024, 1024)
   454  	DISPATCH(runtime·call2048, 2048)
   455  	DISPATCH(runtime·call4096, 4096)
   456  	DISPATCH(runtime·call8192, 8192)
   457  	DISPATCH(runtime·call16384, 16384)
   458  	DISPATCH(runtime·call32768, 32768)
   459  	DISPATCH(runtime·call65536, 65536)
   460  	DISPATCH(runtime·call131072, 131072)
   461  	DISPATCH(runtime·call262144, 262144)
   462  	DISPATCH(runtime·call524288, 524288)
   463  	DISPATCH(runtime·call1048576, 1048576)
   464  	DISPATCH(runtime·call2097152, 2097152)
   465  	DISPATCH(runtime·call4194304, 4194304)
   466  	DISPATCH(runtime·call8388608, 8388608)
   467  	DISPATCH(runtime·call16777216, 16777216)
   468  	DISPATCH(runtime·call33554432, 33554432)
   469  	DISPATCH(runtime·call67108864, 67108864)
   470  	DISPATCH(runtime·call134217728, 134217728)
   471  	DISPATCH(runtime·call268435456, 268435456)
   472  	DISPATCH(runtime·call536870912, 536870912)
   473  	DISPATCH(runtime·call1073741824, 1073741824)
   474  	MOVQ	$runtime·badreflectcall(SB), AX	// argsize is larger than the biggest frame (1073741824)
   475  	JMP	AX
   476  
   477  #define CALLFN(NAME,MAXSIZE)			\
   478  TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   479  	NO_LOCAL_POINTERS;			\
   480  	/* copy argsize bytes from argptr to the bottom of this frame */		\
   481  	MOVQ	argptr+16(FP), SI;		\
   482  	MOVLQZX argsize+24(FP), CX;		\
   483  	MOVQ	SP, DI;				\
   484  	REP;MOVSB;				\
   485  	/* call function: DX = f, target loaded from the word at (DX) */			\
   486  	MOVQ	f+8(FP), DX;			\
   487  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   488  	CALL	(DX);				\
   489  	/* copy return values back: only bytes retoffset..argsize, via callRet */		\
   490  	MOVQ	argtype+0(FP), DX;		\
   491  	MOVQ	argptr+16(FP), DI;		\
   492  	MOVLQZX	argsize+24(FP), CX;		\
   493  	MOVLQZX	retoffset+28(FP), BX;		\
   494  	MOVQ	SP, SI;				\
   495  	ADDQ	BX, DI;				\
   496  	ADDQ	BX, SI;				\
   497  	SUBQ	BX, CX;				\
   498  	CALL	callRet<>(SB);			\
   499  	RET
   500  
   501  // callRet copies return values back at the end of call*. This is a
   502  // separate function so it can allocate stack space for the arguments
   503  // to reflectcallmove. It does not follow the Go ABI; it expects its
   504  // arguments in registers (DX, DI, SI, CX, as set up by CALLFN).
   505  TEXT callRet<>(SB), NOSPLIT, $32-0
   506  	NO_LOCAL_POINTERS
   507  	MOVQ	DX, 0(SP)	// argtype
   508  	MOVQ	DI, 8(SP)	// argptr + retoffset
   509  	MOVQ	SI, 16(SP)	// call* frame base + retoffset
   510  	MOVQ	CX, 24(SP)	// argsize - retoffset
   511  	CALL	runtime·reflectcallmove(SB)
   512  	RET
   513  
   514  CALLFN(·call32, 32)	// one fixed-frame call* function per size listed in reflectcall's DISPATCH chain
   515  CALLFN(·call64, 64)
   516  CALLFN(·call128, 128)
   517  CALLFN(·call256, 256)
   518  CALLFN(·call512, 512)
   519  CALLFN(·call1024, 1024)
   520  CALLFN(·call2048, 2048)
   521  CALLFN(·call4096, 4096)
   522  CALLFN(·call8192, 8192)
   523  CALLFN(·call16384, 16384)
   524  CALLFN(·call32768, 32768)
   525  CALLFN(·call65536, 65536)
   526  CALLFN(·call131072, 131072)
   527  CALLFN(·call262144, 262144)
   528  CALLFN(·call524288, 524288)
   529  CALLFN(·call1048576, 1048576)
   530  CALLFN(·call2097152, 2097152)
   531  CALLFN(·call4194304, 4194304)
   532  CALLFN(·call8388608, 8388608)
   533  CALLFN(·call16777216, 16777216)
   534  CALLFN(·call33554432, 33554432)
   535  CALLFN(·call67108864, 67108864)
   536  CALLFN(·call134217728, 134217728)
   537  CALLFN(·call268435456, 268435456)
   538  CALLFN(·call536870912, 536870912)
   539  CALLFN(·call1073741824, 1073741824)
   540  
   541  TEXT runtime·procyield(SB),NOSPLIT,$0-0
   542  	MOVL	cycles+0(FP), AX	// AX = number of PAUSE iterations still to run
   543  spin:
   544  	PAUSE
   545  	DECL	AX			// count down; ZF is set when AX reaches zero
   546  	JNZ	spin
   547  	RET
   548  
   549  
   550  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   551  	// Stores are already ordered on x86, so this is just a
   552  	// compile barrier: the body is a bare RET with no fence instruction.
   553  	RET
   554  
   555  // void jmpdefer(fn, sp);
   556  // called from deferreturn.
   557  // 1. pop the caller
   558  // 2. sub 5 bytes from the caller's return address, so that it returns to the CALL again
   559  // 3. jmp to the argument
   560  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   561  	MOVQ	fv+0(FP), DX	// fn
   562  	MOVQ	argp+8(FP), BX	// caller sp
   563  	LEAQ	-8(BX), SP	// caller sp after CALL
   564  	MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
   565  	SUBQ	$5, (SP)	// return to CALL again
   566  	MOVQ	0(DX), BX	// BX = code pointer held in the first word of the func value; DX keeps fn
   567  	JMP	BX	// but first run the deferred function
   568  
   569  // Save state of caller into g->sched (pc, sp, bp; ret is zeroed). Smashes R8, R9.
   570  TEXT gosave<>(SB),NOSPLIT,$0
   571  	get_tls(R8)
   572  	MOVQ	g(R8), R8	// R8 = current g
   573  	MOVQ	0(SP), R9	// return address of this call = caller's PC
   574  	MOVQ	R9, (g_sched+gobuf_pc)(R8)
   575  	LEAQ	8(SP), R9	// caller's SP, just above the return address
   576  	MOVQ	R9, (g_sched+gobuf_sp)(R8)
   577  	MOVQ	$0, (g_sched+gobuf_ret)(R8)
   578  	MOVQ	BP, (g_sched+gobuf_bp)(R8)
   579  	// Assert ctxt is zero. See func save.
   580  	MOVQ	(g_sched+gobuf_ctxt)(R8), R9
   581  	TESTQ	R9, R9
   582  	JZ	2(PC)		// ctxt == 0: skip the badctxt call
   583  	CALL	runtime·badctxt(SB)
   584  	RET
   585  
   586  // func asmcgocall(fn, arg unsafe.Pointer) int32
   587  // Call fn(arg) on the scheduler stack,
   588  // aligned appropriately for the gcc ABI.
   589  // See cgocall.go for more details.
   590  TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   591  	MOVQ	fn+0(FP), AX
   592  	MOVQ	arg+8(FP), BX
   593  
   594  	MOVQ	SP, DX		// DX = SP on entry; used below to compute the saved depth or restore SP
   595  
   596  	// Figure out if we need to switch to m->g0 stack.
   597  	// We get called to create new OS threads too, and those
   598  	// come in on the m->g0 stack already.
   599  	get_tls(CX)
   600  	MOVQ	g(CX), R8
   601  	CMPQ	R8, $0
   602  	JEQ	nosave		// no g at all
   603  	MOVQ	g_m(R8), R8
   604  	MOVQ	m_g0(R8), SI
   605  	MOVQ	g(CX), DI
   606  	CMPQ	SI, DI
   607  	JEQ	nosave		// already on m->g0
   608  	MOVQ	m_gsignal(R8), SI
   609  	CMPQ	SI, DI
   610  	JEQ	nosave		// already on m->gsignal
   611  	
   612  	// Switch to system stack.
   613  	MOVQ	m_g0(R8), SI
   614  	CALL	gosave<>(SB)	// record caller state in g->sched (smashes R8, R9)
   615  	MOVQ	SI, g(CX)
   616  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   617  
   618  	// Now on a scheduling stack (a pthread-created stack).
   619  	// Make sure we have enough room for 4 stack-backed fast-call
   620  	// registers as per windows amd64 calling convention.
   621  	SUBQ	$64, SP
   622  	ANDQ	$~15, SP	// alignment for gcc ABI
   623  	MOVQ	DI, 48(SP)	// save g
   624  	MOVQ	(g_stack+stack_hi)(DI), DI
   625  	SUBQ	DX, DI
   626  	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   627  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   628  	MOVQ	BX, CX		// CX = first argument in Win64
   629  	CALL	AX
   630  
   631  	// Restore registers, g, stack pointer.
   632  	get_tls(CX)
   633  	MOVQ	48(SP), DI
   634  	MOVQ	(g_stack+stack_hi)(DI), SI
   635  	SUBQ	40(SP), SI	// SI = stack.hi - saved depth = original SP, relative to the stack's current location
   636  	MOVQ	DI, g(CX)
   637  	MOVQ	SI, SP
   638  
   639  	MOVL	AX, ret+16(FP)	// return the low 32 bits of fn's result
   640  	RET
   641  
   642  nosave:
   643  	// Running on a system stack, perhaps even without a g.
   644  	// Having no g can happen during thread creation or thread teardown
   645  	// (see needm/dropm on Solaris, for example).
   646  	// This code is like the above sequence but without saving/restoring g
   647  	// and without worrying about the stack moving out from under us
   648  	// (because we're on a system stack, not a goroutine stack).
   649  	// The above code could be used directly if already on a system stack,
   650  	// but then the only path through this code would be a rare case on Solaris.
   651  	// Using this code for all "already on system stack" calls exercises it more,
   652  	// which should help keep it correct.
   653  	SUBQ	$64, SP
   654  	ANDQ	$~15, SP
   655  	MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
   656  	MOVQ	DX, 40(SP)	// save original stack pointer
   657  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   658  	MOVQ	BX, CX		// CX = first argument in Win64
   659  	CALL	AX
   660  	MOVQ	40(SP), SI	// restore original stack pointer
   661  	MOVQ	SI, SP
   662  	MOVL	AX, ret+16(FP)
   663  	RET
   664  
   665  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
   666  // Turn the fn into a Go func (by taking its address) and call
   667  // cgocallback_gofunc with that address plus frame, framesize and ctxt copied unchanged.
   668  TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
   669  	LEAQ	fn+0(FP), AX	// address of the fn argument slot, passed as the FuncVal*
   670  	MOVQ	AX, 0(SP)
   671  	MOVQ	frame+8(FP), AX
   672  	MOVQ	AX, 8(SP)
   673  	MOVQ	framesize+16(FP), AX
   674  	MOVQ	AX, 16(SP)
   675  	MOVQ	ctxt+24(FP), AX
   676  	MOVQ	AX, 24(SP)
   677  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   678  	CALL	AX
   679  	RET
   680  
   681  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
   682  // See cgocall.go for more details.
   683  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
   684  	NO_LOCAL_POINTERS
   685  
   686  	// If g is nil, Go did not create the current thread.
   687  	// Call needm to obtain one m for temporary use.
   688  	// In this case, we're running on the thread stack, so there's
   689  	// lots of space, but the linker doesn't know. Hide the call from
   690  	// the linker analysis by using an indirect call through AX.
   691  	get_tls(CX)
   692  #ifdef GOOS_windows
   693  	MOVL	$0, BX
   694  	CMPQ	CX, $0
   695  	JEQ	2(PC)		// no TLS base: skip the g load and keep BX = 0
   696  #endif
   697  	MOVQ	g(CX), BX
   698  	CMPQ	BX, $0
   699  	JEQ	needm
   700  	MOVQ	g_m(BX), BX
   701  	MOVQ	BX, R8 // holds oldm until end of function
   702  	JMP	havem
   703  needm:
   704  	MOVQ	$0, 0(SP)
   705  	MOVQ	$runtime·needm(SB), AX
   706  	CALL	AX
   707  	MOVQ	0(SP), R8	// R8 = the 0 stored above, if needm left 0(SP) alone; R8 == 0 triggers dropm at the end
   708  	get_tls(CX)
   709  	MOVQ	g(CX), BX
   710  	MOVQ	g_m(BX), BX
   711  	
   712  	// Set m->g0->sched.sp = SP, so that if a panic happens
   713  	// during the function we are about to execute, it will
   714  	// have a valid SP to run on the g0 stack.
   715  	// The next few lines (after the havem label)
   716  	// will save this SP onto the stack and then write
   717  	// the same SP back to m->sched.sp. That seems redundant,
   718  	// but if an unrecovered panic happens, unwindm will
   719  	// restore the g->sched.sp from the stack location
   720  	// and then systemstack will try to use it. If we don't set it here,
   721  	// that restored SP will be uninitialized (typically 0) and
   722  	// will not be usable.
   723  	MOVQ	m_g0(BX), SI
   724  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   725  
   726  havem:
   727  	// Now there's a valid m, and we're running on its m->g0.
   728  	// Save current m->g0->sched.sp on stack and then set it to SP.
   729  	// Save current sp in m->g0->sched.sp in preparation for
   730  	// switch back to m->curg stack.
   731  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   732  	MOVQ	m_g0(BX), SI
   733  	MOVQ	(g_sched+gobuf_sp)(SI), AX
   734  	MOVQ	AX, 0(SP)
   735  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   736  
   737  	// Switch to m->curg stack and call runtime.cgocallbackg.
   738  	// Because we are taking over the execution of m->curg
   739  	// but *not* resuming what had been running, we need to
   740  	// save that information (m->curg->sched) so we can restore it.
   741  	// We can restore m->curg->sched.sp easily, because calling
   742  	// runtime.cgocallbackg leaves SP unchanged upon return.
   743  	// To save m->curg->sched.pc, we push it onto the stack.
   744  	// This has the added benefit that it looks to the traceback
   745  	// routine like cgocallbackg is going to return to that
   746  	// PC (because the frame we allocate below has the same
   747  	// size as cgocallback_gofunc's frame declared above)
   748  	// so that the traceback will seamlessly trace back into
   749  	// the earlier calls.
   750  	//
   751  	// In the new goroutine, 8(SP) holds the saved R8.
   752  	MOVQ	m_curg(BX), SI
   753  	MOVQ	SI, g(CX)
   754  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   755  	MOVQ	(g_sched+gobuf_pc)(SI), BX
   756  	MOVQ	BX, -8(DI)
   757  	// Compute the size of the frame, including return PC and, if
   758  	// GOEXPERIMENT=framepointer, the saved base pointer
   759  	MOVQ	ctxt+24(FP), BX
   760  	LEAQ	fv+0(FP), AX
   761  	SUBQ	SP, AX		// AX = distance from SP to the first argument = frame size
   762  	SUBQ	AX, DI
   763  	MOVQ	DI, SP
   764  
   765  	MOVQ	R8, 8(SP)
   766  	MOVQ	BX, 0(SP)	// ctxt, passed to cgocallbackg
   767  	CALL	runtime·cgocallbackg(SB)
   768  	MOVQ	8(SP), R8
   769  
   770  	// Compute the size of the frame again. FP and SP have
   771  	// completely different values here than they did above,
   772  	// but only their difference matters.
   773  	LEAQ	fv+0(FP), AX
   774  	SUBQ	SP, AX
   775  
   776  	// Restore g->sched (== m->curg->sched) from saved values.
   777  	get_tls(CX)
   778  	MOVQ	g(CX), SI
   779  	MOVQ	SP, DI
   780  	ADDQ	AX, DI
   781  	MOVQ	-8(DI), BX
   782  	MOVQ	BX, (g_sched+gobuf_pc)(SI)
   783  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   784  
   785  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   786  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   787  	// so we do not have to restore it.)
   788  	MOVQ	g(CX), BX
   789  	MOVQ	g_m(BX), BX
   790  	MOVQ	m_g0(BX), SI
   791  	MOVQ	SI, g(CX)
   792  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   793  	MOVQ	0(SP), AX
   794  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   795  	
   796  	// If the m on entry was nil, we called needm above to borrow an m
   797  	// for the duration of the call. Since the call is over, return it with dropm.
   798  	CMPQ	R8, $0
   799  	JNE 3(PC)		// had an m on entry: skip the dropm call
   800  	MOVQ	$runtime·dropm(SB), AX
   801  	CALL	AX
   802  
   803  	// Done!
   804  	RET
   805  
   806  // void setg(G*); set g. for use by needm. Stores gg into the TLS g slot.
   807  TEXT runtime·setg(SB), NOSPLIT, $0-8
   808  	MOVQ	gg+0(FP), BX
   809  #ifdef GOOS_windows
   810  	CMPQ	BX, $0
   811  	JNE	settls
   812  	MOVQ	$0, 0x28(GS)	// gg == nil: clear the slot at GS:0x28 and return without touching g
   813  	RET
   814  settls:
   815  	MOVQ	g_m(BX), AX
   816  	LEAQ	m_tls(AX), AX
   817  	MOVQ	AX, 0x28(GS)	// GS:0x28 = &gg->m->tls
   818  #endif
   819  	get_tls(CX)
   820  	MOVQ	BX, g(CX)
   821  	RET
   822  
   823  // void setg_gcc(G*); set g called from gcc. The G* arrives in DI rather than on the Go stack.
   824  TEXT setg_gcc<>(SB),NOSPLIT,$0
   825  	get_tls(AX)
   826  	MOVQ	DI, g(AX)	// g = DI (presumably the first integer argument of the SysV AMD64 ABI; confirm for other ABIs)
   827  	RET
   828  
   829  // check that SP is in range [g->stack.lo, g->stack.hi); traps with INT $3 otherwise.
   830  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   831  	get_tls(CX)
   832  	MOVQ	g(CX), AX
   833  	CMPQ	(g_stack+stack_hi)(AX), SP
   834  	JHI	2(PC)		// stack.hi > SP (unsigned): skip the trap
   835  	INT	$3
   836  	CMPQ	SP, (g_stack+stack_lo)(AX)
   837  	JHI	2(PC)		// SP > stack.lo (unsigned): skip the trap. NOTE(review): strict, so SP == stack.lo also traps
   838  	INT	$3
   839  	RET
   840  
   841  TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16	// returns the word stored just below argp, or nextBarrierPC's result if that word equals stackBarrierPC
   842  	MOVQ	argp+0(FP),AX		// addr of first arg
   843  	MOVQ	-8(AX),AX		// get calling pc
   844  	CMPQ	AX, runtime·stackBarrierPC(SB)
   845  	JNE	nobar
   846  	// Get original return PC.
   847  	CALL	runtime·nextBarrierPC(SB)
   848  	MOVQ	0(SP), AX		// result slot of nextBarrierPC
   849  nobar:
   850  	MOVQ	AX, ret+8(FP)
   851  	RET
   852  
   853  TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16	// overwrites the word just below argp with pc, or calls setNextBarrierPC if that word equals stackBarrierPC
   854  	MOVQ	argp+0(FP),AX		// addr of first arg
   855  	MOVQ	pc+8(FP), BX
   856  	MOVQ	-8(AX), CX		// CX = current return PC stored below argp
   857  	CMPQ	CX, runtime·stackBarrierPC(SB)
   858  	JEQ	setbar
   859  	MOVQ	BX, -8(AX)		// set calling pc
   860  	RET
   861  setbar:
   862  	// Set the stack barrier return PC.
   863  	MOVQ	BX, 0(SP)		// argument for setNextBarrierPC: the new pc
   864  	CALL	runtime·setNextBarrierPC(SB)
   865  	RET
   866  
   867  // func cputicks() int64
   868  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   869  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   870  	JEQ	uselfence
   871  	MFENCE			// flag not set: fall back to MFENCE
   872  	JMP	readtsc
   873  uselfence:
   874  	LFENCE			// flag set by rt0_go: LFENCE is enough
   875  readtsc:
   876  	RDTSC			// counter arrives split across DX (high) and AX (low)
   877  	SHLQ	$32, DX
   878  	ADDQ	DX, AX		// AX = DX<<32 + AX
   879  	MOVQ	AX, ret+0(FP)
   880  	RET
   881  
   882  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   883  // forwards to memhash(p, h, size), taking size from the
   884  // second word of the closure that DX points at.
   885  TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
   886  	GO_ARGS
   887  	NO_LOCAL_POINTERS
   888  	MOVQ	p+0(FP), AX
   889  	MOVQ	AX, 0(SP)		// memhash arg 0: p
   890  	MOVQ	h+8(FP), AX
   891  	MOVQ	AX, 8(SP)		// memhash arg 1: h
   892  	MOVQ	8(DX), AX
   893  	MOVQ	AX, 16(SP)		// memhash arg 2: size read from the closure
   894  	CALL	runtime·memhash(SB)
   895  	MOVQ	24(SP), CX		// memhash result
   896  	MOVQ	CX, ret+16(FP)
   897  	RET
   898  
   899  // hash function using AES hardware instructions; loads the registers aeshashbody expects and tail-jumps to it
   900  TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   901  	LEAQ	ret+24(FP), DX	// DX = where aeshashbody stores the result
   902  	MOVQ	s+16(FP), CX	// CX = size in bytes
   903  	MOVQ	p+0(FP), AX	// AX = pointer to the data
   904  	JMP	runtime·aeshashbody(SB)
   905  
   906  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
   907  	MOVQ	p+0(FP), CX	// CX = pointer to the string struct
   908  	LEAQ	ret+16(FP), DX	// DX = where aeshashbody stores the result
   909  	MOVQ	(CX), AX	// AX = string data pointer
   910  	MOVQ	8(CX), CX	// CX = string length
   911  	JMP	runtime·aeshashbody(SB)
   912  
   913  // AX: data
   914  // CX: length
   915  // DX: address to put return value
        // Shared body that aeshash and aeshashstr (above) tail-jump into with the
        // registers set as listed. The frame is $0-0, so h+8(FP) below reads the
        // seed out of the jumping function's argument area.
        // The code dispatches on the length in CX to a size-specialized path;
        // every path ends by storing the low 64 bits of an XMM register to (DX).
   916  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   917  	// Fill an SSE register with our seeds.
   918  	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
   919  	PINSRW	$4, CX, X0			// 16 bits of length
   920  	PSHUFHW $0, X0, X0			// repeat length 4 times total
   921  	MOVO	X0, X1				// save unscrambled seed
   922  	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
   923  	AESENC	X0, X0				// scramble seed
   924  
        	// Dispatch on length (unsigned): 0-15, 16, 17-32, 33-64, 65-128, 129+.
   925  	CMPQ	CX, $16
   926  	JB	aes0to15
   927  	JE	aes16
   928  	CMPQ	CX, $32
   929  	JBE	aes17to32
   930  	CMPQ	CX, $64
   931  	JBE	aes33to64
   932  	CMPQ	CX, $128
   933  	JBE	aes65to128
   934  	JMP	aes129plus
   935  
   936  aes0to15:
   937  	TESTQ	CX, CX
   938  	JE	aes0
   939  
        	// (ptr+16) has bits 4-11 clear exactly when the low 12 bits of ptr
        	// are 0xff0-0xfff; in that case go to endofpage instead of loading at ptr.
   940  	ADDQ	$16, AX
   941  	TESTW	$0xff0, AX
   942  	JE	endofpage
   943  
   944  	// 16 bytes loaded at this address won't cross
   945  	// a page boundary, so we can load it directly.
   946  	MOVOU	-16(AX), X1	// AX was advanced by 16 above, so this reads from the original ptr
   947  	ADDQ	CX, CX	// CX = 2*len, so (CX*8) below steps through 16-byte table entries
   948  	MOVQ	$masks<>(SB), AX
   949  	PAND	(AX)(CX*8), X1	// keep only the low len bytes of the load
        	// final1 is also entered from endofpage and aes16 with the data in X1.
   950  final1:
   951  	PXOR	X0, X1	// xor data with seed
   952  	AESENC	X1, X1	// scramble combo 3 times
   953  	AESENC	X1, X1
   954  	AESENC	X1, X1
   955  	MOVQ	X1, (DX)
   956  	RET
   957  
   958  endofpage:
   959  	// address ends in 1111xxxx. Might be up against
   960  	// a page boundary, so load ending at last byte.
   961  	// Then shift bytes down using pshufb.
   962  	MOVOU	-32(AX)(CX*1), X1	// AX is ptr+16 here, so this is the 16 bytes ending at ptr+len
   963  	ADDQ	CX, CX	// CX = 2*len: index of the 16-byte shifts<> entry for len
   964  	MOVQ	$shifts<>(SB), AX
   965  	PSHUFB	(AX)(CX*8), X1
   966  	JMP	final1
   967  
   968  aes0:
   969  	// Return scrambled input seed
   970  	AESENC	X0, X0
   971  	MOVQ	X0, (DX)
   972  	RET
   973  
   974  aes16:
   975  	MOVOU	(AX), X1
   976  	JMP	final1
   977  
   978  aes17to32:
   979  	// make second starting seed
   980  	PXOR	runtime·aeskeysched+16(SB), X1
   981  	AESENC	X1, X1
   982  	
   983  	// load data to be hashed
   984  	MOVOU	(AX), X2	// first 16 bytes
   985  	MOVOU	-16(AX)(CX*1), X3	// last 16 bytes; overlaps X2 when len < 32
   986  
   987  	// xor with seed
   988  	PXOR	X0, X2
   989  	PXOR	X1, X3
   990  
   991  	// scramble 3 times
   992  	AESENC	X2, X2
   993  	AESENC	X3, X3
   994  	AESENC	X2, X2
   995  	AESENC	X3, X3
   996  	AESENC	X2, X2
   997  	AESENC	X3, X3
   998  
   999  	// combine results
  1000  	PXOR	X3, X2
  1001  	MOVQ	X2, (DX)
  1002  	RET
  1003  
  1004  aes33to64:
  1005  	// make 3 more starting seeds
  1006  	MOVO	X1, X2
  1007  	MOVO	X1, X3
  1008  	PXOR	runtime·aeskeysched+16(SB), X1
  1009  	PXOR	runtime·aeskeysched+32(SB), X2
  1010  	PXOR	runtime·aeskeysched+48(SB), X3
  1011  	AESENC	X1, X1
  1012  	AESENC	X2, X2
  1013  	AESENC	X3, X3
  1014  	
        	// first 32 bytes and last 32 bytes (the two halves overlap when len < 64)
  1015  	MOVOU	(AX), X4
  1016  	MOVOU	16(AX), X5
  1017  	MOVOU	-32(AX)(CX*1), X6
  1018  	MOVOU	-16(AX)(CX*1), X7
  1019  
  1020  	PXOR	X0, X4
  1021  	PXOR	X1, X5
  1022  	PXOR	X2, X6
  1023  	PXOR	X3, X7
  1024  	
  1025  	AESENC	X4, X4
  1026  	AESENC	X5, X5
  1027  	AESENC	X6, X6
  1028  	AESENC	X7, X7
  1029  	
  1030  	AESENC	X4, X4
  1031  	AESENC	X5, X5
  1032  	AESENC	X6, X6
  1033  	AESENC	X7, X7
  1034  	
  1035  	AESENC	X4, X4
  1036  	AESENC	X5, X5
  1037  	AESENC	X6, X6
  1038  	AESENC	X7, X7
  1039  
        	// fold the four lanes into X4
  1040  	PXOR	X6, X4
  1041  	PXOR	X7, X5
  1042  	PXOR	X5, X4
  1043  	MOVQ	X4, (DX)
  1044  	RET
  1045  
  1046  aes65to128:
  1047  	// make 7 more starting seeds
  1048  	MOVO	X1, X2
  1049  	MOVO	X1, X3
  1050  	MOVO	X1, X4
  1051  	MOVO	X1, X5
  1052  	MOVO	X1, X6
  1053  	MOVO	X1, X7
  1054  	PXOR	runtime·aeskeysched+16(SB), X1
  1055  	PXOR	runtime·aeskeysched+32(SB), X2
  1056  	PXOR	runtime·aeskeysched+48(SB), X3
  1057  	PXOR	runtime·aeskeysched+64(SB), X4
  1058  	PXOR	runtime·aeskeysched+80(SB), X5
  1059  	PXOR	runtime·aeskeysched+96(SB), X6
  1060  	PXOR	runtime·aeskeysched+112(SB), X7
  1061  	AESENC	X1, X1
  1062  	AESENC	X2, X2
  1063  	AESENC	X3, X3
  1064  	AESENC	X4, X4
  1065  	AESENC	X5, X5
  1066  	AESENC	X6, X6
  1067  	AESENC	X7, X7
  1068  
  1069  	// load data
        	// (first 64 bytes and last 64 bytes; they overlap when len < 128)
  1070  	MOVOU	(AX), X8
  1071  	MOVOU	16(AX), X9
  1072  	MOVOU	32(AX), X10
  1073  	MOVOU	48(AX), X11
  1074  	MOVOU	-64(AX)(CX*1), X12
  1075  	MOVOU	-48(AX)(CX*1), X13
  1076  	MOVOU	-32(AX)(CX*1), X14
  1077  	MOVOU	-16(AX)(CX*1), X15
  1078  
  1079  	// xor with seed
  1080  	PXOR	X0, X8
  1081  	PXOR	X1, X9
  1082  	PXOR	X2, X10
  1083  	PXOR	X3, X11
  1084  	PXOR	X4, X12
  1085  	PXOR	X5, X13
  1086  	PXOR	X6, X14
  1087  	PXOR	X7, X15
  1088  
  1089  	// scramble 3 times
  1090  	AESENC	X8, X8
  1091  	AESENC	X9, X9
  1092  	AESENC	X10, X10
  1093  	AESENC	X11, X11
  1094  	AESENC	X12, X12
  1095  	AESENC	X13, X13
  1096  	AESENC	X14, X14
  1097  	AESENC	X15, X15
  1098  
  1099  	AESENC	X8, X8
  1100  	AESENC	X9, X9
  1101  	AESENC	X10, X10
  1102  	AESENC	X11, X11
  1103  	AESENC	X12, X12
  1104  	AESENC	X13, X13
  1105  	AESENC	X14, X14
  1106  	AESENC	X15, X15
  1107  
  1108  	AESENC	X8, X8
  1109  	AESENC	X9, X9
  1110  	AESENC	X10, X10
  1111  	AESENC	X11, X11
  1112  	AESENC	X12, X12
  1113  	AESENC	X13, X13
  1114  	AESENC	X14, X14
  1115  	AESENC	X15, X15
  1116  
  1117  	// combine results
  1118  	PXOR	X12, X8
  1119  	PXOR	X13, X9
  1120  	PXOR	X14, X10
  1121  	PXOR	X15, X11
  1122  	PXOR	X10, X8
  1123  	PXOR	X11, X9
  1124  	PXOR	X9, X8
  1125  	MOVQ	X8, (DX)
  1126  	RET
  1127  
  1128  aes129plus:
  1129  	// make 7 more starting seeds
  1130  	MOVO	X1, X2
  1131  	MOVO	X1, X3
  1132  	MOVO	X1, X4
  1133  	MOVO	X1, X5
  1134  	MOVO	X1, X6
  1135  	MOVO	X1, X7
  1136  	PXOR	runtime·aeskeysched+16(SB), X1
  1137  	PXOR	runtime·aeskeysched+32(SB), X2
  1138  	PXOR	runtime·aeskeysched+48(SB), X3
  1139  	PXOR	runtime·aeskeysched+64(SB), X4
  1140  	PXOR	runtime·aeskeysched+80(SB), X5
  1141  	PXOR	runtime·aeskeysched+96(SB), X6
  1142  	PXOR	runtime·aeskeysched+112(SB), X7
  1143  	AESENC	X1, X1
  1144  	AESENC	X2, X2
  1145  	AESENC	X3, X3
  1146  	AESENC	X4, X4
  1147  	AESENC	X5, X5
  1148  	AESENC	X6, X6
  1149  	AESENC	X7, X7
  1150  	
  1151  	// start with last (possibly overlapping) block
  1152  	MOVOU	-128(AX)(CX*1), X8
  1153  	MOVOU	-112(AX)(CX*1), X9
  1154  	MOVOU	-96(AX)(CX*1), X10
  1155  	MOVOU	-80(AX)(CX*1), X11
  1156  	MOVOU	-64(AX)(CX*1), X12
  1157  	MOVOU	-48(AX)(CX*1), X13
  1158  	MOVOU	-32(AX)(CX*1), X14
  1159  	MOVOU	-16(AX)(CX*1), X15
  1160  
  1161  	// xor in seed
  1162  	PXOR	X0, X8
  1163  	PXOR	X1, X9
  1164  	PXOR	X2, X10
  1165  	PXOR	X3, X11
  1166  	PXOR	X4, X12
  1167  	PXOR	X5, X13
  1168  	PXOR	X6, X14
  1169  	PXOR	X7, X15
  1170  	
  1171  	// compute number of remaining 128-byte blocks
        	// CX = (len-1)/128, which is at least 1 because len >= 129 on this path.
  1172  	DECQ	CX
  1173  	SHRQ	$7, CX
  1174  	
  1175  aesloop:
  1176  	// scramble state
  1177  	AESENC	X8, X8
  1178  	AESENC	X9, X9
  1179  	AESENC	X10, X10
  1180  	AESENC	X11, X11
  1181  	AESENC	X12, X12
  1182  	AESENC	X13, X13
  1183  	AESENC	X14, X14
  1184  	AESENC	X15, X15
  1185  
  1186  	// scramble state, xor in a block
        	// X0-X7 are reused for data from here on; the seeds they held were
        	// already xored into X8-X15 above.
  1187  	MOVOU	(AX), X0
  1188  	MOVOU	16(AX), X1
  1189  	MOVOU	32(AX), X2
  1190  	MOVOU	48(AX), X3
  1191  	AESENC	X0, X8
  1192  	AESENC	X1, X9
  1193  	AESENC	X2, X10
  1194  	AESENC	X3, X11
  1195  	MOVOU	64(AX), X4
  1196  	MOVOU	80(AX), X5
  1197  	MOVOU	96(AX), X6
  1198  	MOVOU	112(AX), X7
  1199  	AESENC	X4, X12
  1200  	AESENC	X5, X13
  1201  	AESENC	X6, X14
  1202  	AESENC	X7, X15
  1203  
  1204  	ADDQ	$128, AX	// advance to the next 128-byte block
  1205  	DECQ	CX
  1206  	JNE	aesloop
  1207  
  1208  	// 3 more scrambles to finish
  1209  	AESENC	X8, X8
  1210  	AESENC	X9, X9
  1211  	AESENC	X10, X10
  1212  	AESENC	X11, X11
  1213  	AESENC	X12, X12
  1214  	AESENC	X13, X13
  1215  	AESENC	X14, X14
  1216  	AESENC	X15, X15
  1217  	AESENC	X8, X8
  1218  	AESENC	X9, X9
  1219  	AESENC	X10, X10
  1220  	AESENC	X11, X11
  1221  	AESENC	X12, X12
  1222  	AESENC	X13, X13
  1223  	AESENC	X14, X14
  1224  	AESENC	X15, X15
  1225  	AESENC	X8, X8
  1226  	AESENC	X9, X9
  1227  	AESENC	X10, X10
  1228  	AESENC	X11, X11
  1229  	AESENC	X12, X12
  1230  	AESENC	X13, X13
  1231  	AESENC	X14, X14
  1232  	AESENC	X15, X15
  1233  
        	// fold the eight lanes into X8
  1234  	PXOR	X12, X8
  1235  	PXOR	X13, X9
  1236  	PXOR	X14, X10
  1237  	PXOR	X15, X11
  1238  	PXOR	X10, X8
  1239  	PXOR	X11, X9
  1240  	PXOR	X9, X8
  1241  	MOVQ	X8, (DX)
  1242  	RET
  1243  	
  1244  TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
  1245  	MOVQ	h+8(FP), X0	// low 64 bits of X0 = seed
  1246  	MOVQ	p+0(FP), AX	// address of the 4-byte value to hash
  1247  	PINSRD	$2, (AX), X0	// value goes into dword lane 2
  1248  	AESENC	runtime·aeskeysched+0(SB), X0	// three rounds, keyed by the
  1249  	AESENC	runtime·aeskeysched+16(SB), X0	// first three 16-byte entries
  1250  	AESENC	runtime·aeskeysched+32(SB), X0	// of aeskeysched
  1251  	MOVQ	X0, ret+16(FP)	// result = low 64 bits of X0
  1252  	RET
  1253  
  1254  TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
  1255  	MOVQ	h+8(FP), X0	// low 64 bits of X0 = seed
  1256  	MOVQ	p+0(FP), AX	// address of the 8-byte value to hash
  1257  	PINSRQ	$1, (AX), X0	// value goes into the high qword
  1258  	AESENC	runtime·aeskeysched+0(SB), X0	// three rounds, keyed by the
  1259  	AESENC	runtime·aeskeysched+16(SB), X0	// first three 16-byte entries
  1260  	AESENC	runtime·aeskeysched+32(SB), X0	// of aeskeysched
  1261  	MOVQ	X0, ret+16(FP)	// result = low 64 bits of X0
  1262  	RET
  1263  
  1264  // Byte masks for PAND: the 16-byte entry at index n (0..15) has 0xff in its
        // low n bytes and 0 elsewhere, so it clears the unwanted high bytes of a register.
  1265  DATA masks<>+0x00(SB)/8, $0x0000000000000000
  1266  DATA masks<>+0x08(SB)/8, $0x0000000000000000
  1267  DATA masks<>+0x10(SB)/8, $0x00000000000000ff
  1268  DATA masks<>+0x18(SB)/8, $0x0000000000000000
  1269  DATA masks<>+0x20(SB)/8, $0x000000000000ffff
  1270  DATA masks<>+0x28(SB)/8, $0x0000000000000000
  1271  DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
  1272  DATA masks<>+0x38(SB)/8, $0x0000000000000000
  1273  DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
  1274  DATA masks<>+0x48(SB)/8, $0x0000000000000000
  1275  DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
  1276  DATA masks<>+0x58(SB)/8, $0x0000000000000000
  1277  DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
  1278  DATA masks<>+0x68(SB)/8, $0x0000000000000000
  1279  DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
  1280  DATA masks<>+0x78(SB)/8, $0x0000000000000000
  1281  DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
  1282  DATA masks<>+0x88(SB)/8, $0x0000000000000000
  1283  DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
  1284  DATA masks<>+0x98(SB)/8, $0x00000000000000ff
  1285  DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
  1286  DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
  1287  DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
  1288  DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
  1289  DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
  1290  DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
  1291  DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
  1292  DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
  1293  DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
  1294  DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
  1295  DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
  1296  DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
  1297  GLOBL masks<>(SB),RODATA,$256
  1298  
  1299  TEXT ·checkASM(SB),NOSPLIT,$0-1
  1300  	// Report whether both masks<>(SB) and shifts<>(SB) sit on 16-byte boundaries.
  1301  	MOVQ	$shifts<>(SB), BX
  1302  	MOVQ	$masks<>(SB), AX
  1303  	ORQ	BX, AX	// a low bit set in either address survives the OR
  1304  	TESTQ	$15, AX	// ZF set only if the low 4 bits of both are zero
  1305  	SETEQ	ret+0(FP)
  1306  	RET
  1307  
  1308  // These are control masks for PSHUFB. The 16-byte entry at index n moves
  1309  // the top n bytes of the register down to its low n bytes; the 0xff
  1310  // selector bytes zero everything else. The index is how many bytes to move.
  1311  DATA shifts<>+0x00(SB)/8, $0x0000000000000000
  1312  DATA shifts<>+0x08(SB)/8, $0x0000000000000000
  1313  DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
  1314  DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
  1315  DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
  1316  DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
  1317  DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
  1318  DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
  1319  DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
  1320  DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
  1321  DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
  1322  DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
  1323  DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
  1324  DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
  1325  DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
  1326  DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
  1327  DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
  1328  DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
  1329  DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
  1330  DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
  1331  DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
  1332  DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
  1333  DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
  1334  DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
  1335  DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
  1336  DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
  1337  DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
  1338  DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
  1339  DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
  1340  DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
  1341  DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
  1342  DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
  1343  GLOBL shifts<>(SB),RODATA,$256
  1344  
  1345  // memequal(a, b unsafe.Pointer, size uintptr) bool
  1346  TEXT runtime·memequal(SB),NOSPLIT,$0-25
  1347  	MOVQ	a+0(FP), SI
  1348  	MOVQ	b+8(FP), DI
  1349  	CMPQ	SI, DI
  1350  	JNE	differ
  1351  	MOVB	$1, ret+24(FP)	// same address: equal without looking at the bytes
  1352  	RET
  1353  differ:
  1354  	MOVQ	size+16(FP), BX	// byte count for memeqbody
  1355  	LEAQ	ret+24(FP), AX	// memeqbody writes the result byte through AX
  1356  	JMP	runtime·memeqbody(SB)
  1357  
  1358  // memequal_varlen(a, b unsafe.Pointer) bool
  1359  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
  1360  	MOVQ	a+0(FP), SI
  1361  	MOVQ	b+8(FP), DI
  1362  	CMPQ	SI, DI
  1363  	JNE	differ
  1364  	MOVB	$1, ret+16(FP)	// same address: equal without looking at the bytes
  1365  	RET
  1366  differ:
  1367  	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
  1368  	LEAQ	ret+16(FP), AX	// memeqbody writes the result byte through AX
  1369  	JMP	runtime·memeqbody(SB)
  1370  
  1371  // eqstring reports whether two strings hold the same bytes.
  1372  // Only s1's length is read here: the compiler guarantees
  1373  // that both strings passed to eqstring have equal length.
  1374  // See runtime_test.go:eqstring_generic for
  1375  // equivalent Go code.
  1376  TEXT runtime·eqstring(SB),NOSPLIT,$0-33
  1377  	MOVQ	s1_base+0(FP), SI
  1378  	MOVQ	s2_base+16(FP), DI
  1379  	CMPQ	SI, DI
  1380  	JNE	differ
  1381  	MOVB	$1, ret+32(FP)	// same data pointer: equal
  1382  	RET
  1383  differ:
  1384  	MOVQ	s1_len+8(FP), BX	// byte count for memeqbody
  1385  	LEAQ	ret+32(FP), AX	// memeqbody writes the result byte through AX
  1386  	JMP	runtime·memeqbody(SB)
  1387  
  1388  // a in SI
  1389  // b in DI
  1390  // count in BX
  1391  // address of result byte in AX
        // Compares BX bytes at SI and DI and writes 1 (equal) or 0 to the byte at (AX).
        // Strategy by size: <8 bytes -> small; 8-63 -> 8-byte loop; >=64 -> 64-byte
        // SSE loop, or the AVX2 loop when runtime·support_avx2 is 1.
  1392  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1393  	CMPQ	BX, $8
  1394  	JB	small
  1395  	CMPQ	BX, $64
  1396  	JB	bigloop
  1397  	CMPB    runtime·support_avx2(SB), $1
  1398  	JE	hugeloop_avx2
  1399  	
  1400  	// 64 bytes at a time using xmm registers
  1401  hugeloop:
  1402  	CMPQ	BX, $64
  1403  	JB	bigloop
  1404  	MOVOU	(SI), X0
  1405  	MOVOU	(DI), X1
  1406  	MOVOU	16(SI), X2
  1407  	MOVOU	16(DI), X3
  1408  	MOVOU	32(SI), X4
  1409  	MOVOU	32(DI), X5
  1410  	MOVOU	48(SI), X6
  1411  	MOVOU	48(DI), X7
  1412  	PCMPEQB	X1, X0
  1413  	PCMPEQB	X3, X2
  1414  	PCMPEQB	X5, X4
  1415  	PCMPEQB	X7, X6
  1416  	PAND	X2, X0
  1417  	PAND	X6, X4
  1418  	PAND	X4, X0	// X0 = AND of the four per-byte equality masks
  1419  	PMOVMSKB X0, DX
  1420  	ADDQ	$64, SI
  1421  	ADDQ	$64, DI
  1422  	SUBQ	$64, BX
  1423  	CMPL	DX, $0xffff	// all 16 mask bits set <=> all 64 bytes matched
  1424  	JEQ	hugeloop
  1425  	MOVB	$0, (AX)
  1426  	RET
  1427  
  1428  	// 64 bytes at a time using ymm registers
  1429  hugeloop_avx2:
  1430  	CMPQ	BX, $64
  1431  	JB	bigloop_avx2
  1432  	VMOVDQU	(SI), Y0
  1433  	VMOVDQU	(DI), Y1
  1434  	VMOVDQU	32(SI), Y2
  1435  	VMOVDQU	32(DI), Y3
  1436  	VPCMPEQB	Y1, Y0, Y4
  1437  	VPCMPEQB	Y2, Y3, Y5
  1438  	VPAND	Y4, Y5, Y6
  1439  	VPMOVMSKB Y6, DX
  1440  	ADDQ	$64, SI
  1441  	ADDQ	$64, DI
  1442  	SUBQ	$64, BX
  1443  	CMPL	DX, $0xffffffff	// all 32 mask bits set <=> all 64 bytes matched
  1444  	JEQ	hugeloop_avx2
  1445  	VZEROUPPER
  1446  	MOVB	$0, (AX)
  1447  	RET
  1448  
        	// Leaving the AVX2 loop: clear the upper YMM halves, then fall through to bigloop.
  1449  bigloop_avx2:
  1450  	VZEROUPPER
  1451  
  1452  	// 8 bytes at a time using 64-bit register
  1453  bigloop:
  1454  	CMPQ	BX, $8
  1455  	JBE	leftover
  1456  	MOVQ	(SI), CX
  1457  	MOVQ	(DI), DX
  1458  	ADDQ	$8, SI
  1459  	ADDQ	$8, DI
  1460  	SUBQ	$8, BX
  1461  	CMPQ	CX, DX
  1462  	JEQ	bigloop
  1463  	MOVB	$0, (AX)
  1464  	RET
  1465  
  1466  	// remaining 0-8 bytes
        	// Compare the 8 bytes that end at the end of each buffer. This path is only
        	// reached when the original count was >= 8, so the loads may re-read bytes
        	// that were already compared but do not start before the buffers.
  1467  leftover:
  1468  	MOVQ	-8(SI)(BX*1), CX
  1469  	MOVQ	-8(DI)(BX*1), DX
  1470  	CMPQ	CX, DX
  1471  	SETEQ	(AX)
  1472  	RET
  1473  
        	// fewer than 8 bytes in total
  1474  small:
  1475  	CMPQ	BX, $0
  1476  	JEQ	equal	// zero bytes: ZF from this compare makes SETEQ at equal store 1
  1477  
  1478  	LEAQ	0(BX*8), CX
  1479  	NEGQ	CX	// CX = -8*count; as a 64-bit shift amount this acts as 64 - 8*count
  1480  
  1481  	CMPB	SI, $0xf8	// low address byte > 0xf8: take the si_high path
  1482  	JA	si_high
  1483  
  1484  	// load at SI won't cross a page boundary.
  1485  	MOVQ	(SI), SI
  1486  	JMP	si_finish
  1487  si_high:
  1488  	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
  1489  	MOVQ	-8(SI)(BX*1), SI
  1490  	SHRQ	CX, SI
  1491  si_finish:
  1492  
  1493  	// same for DI.
  1494  	CMPB	DI, $0xf8
  1495  	JA	di_high
  1496  	MOVQ	(DI), DI
  1497  	JMP	di_finish
  1498  di_high:
  1499  	MOVQ	-8(DI)(BX*1), DI
  1500  	SHRQ	CX, DI
  1501  di_finish:
  1502  
        	// The low count bytes of SI and DI now hold the data. Shifting the difference
        	// left by CX drops the bytes above count, so ZF is set iff the data matched.
  1503  	SUBQ	SI, DI
  1504  	SHLQ	CX, DI
  1505  equal:
  1506  	SETEQ	(AX)
  1507  	RET
  1508  
  1509  TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
  1510  	LEAQ	ret+32(FP), R9	// cmpbody stores its result through R9
  1511  	MOVQ	s2_len+24(FP), DX	// blen
  1512  	MOVQ	s1_len+8(FP), BX	// alen
  1513  	MOVQ	s2_base+16(FP), DI	// b
  1514  	MOVQ	s1_base+0(FP), SI	// a
  1515  	JMP	runtime·cmpbody(SB)
  1516  
  1517  TEXT bytes·Compare(SB),NOSPLIT,$0-56
  1518  	LEAQ	res+48(FP), R9	// cmpbody stores its result through R9
  1519  	MOVQ	s2+32(FP), DX	// blen (second word of s2)
  1520  	MOVQ	s1+8(FP), BX	// alen (second word of s1)
  1521  	MOVQ	s2+24(FP), DI	// b (first word of s2)
  1522  	MOVQ	s1+0(FP), SI	// a (first word of s1)
  1523  	JMP	runtime·cmpbody(SB)
  1524  
  1525  // input:
  1526  //   SI = a
  1527  //   DI = b
  1528  //   BX = alen
  1529  //   DX = blen
  1530  //   R9 = address of output word (stores -1/0/1 here)
        // Compares the first min(alen, blen) bytes; if they all match (or a and b
        // are the same pointer) the result is decided by the lengths at allsame.
  1531  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1532  	CMPQ	SI, DI
  1533  	JEQ	allsame
  1534  	CMPQ	BX, DX
  1535  	MOVQ	DX, R8
  1536  	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
  1537  	CMPQ	R8, $8
  1538  	JB	small
  1539  
  1540  	CMPQ	R8, $63
  1541  	JBE	loop
  1542  	CMPB    runtime·support_avx2(SB), $1
  1543  	JEQ     big_loop_avx2
  1544  	JMP	big_loop
        	// 16 bytes per iteration while more than 16 bytes remain
  1545  loop:
  1546  	CMPQ	R8, $16
  1547  	JBE	_0through16
  1548  	MOVOU	(SI), X0
  1549  	MOVOU	(DI), X1
  1550  	PCMPEQB X0, X1
  1551  	PMOVMSKB X1, AX
  1552  	XORQ	$0xffff, AX	// convert EQ to NE
  1553  	JNE	diff16	// branch if at least one byte is not equal
  1554  	ADDQ	$16, SI
  1555  	ADDQ	$16, DI
  1556  	SUBQ	$16, R8
  1557  	JMP	loop
  1558  	
        	// diff64/diff48/diff32 advance SI and DI to the 16-byte chunk (within a
        	// 64-byte block) whose difference mask is in AX, then share diff16.
  1559  diff64:
  1560  	ADDQ	$48, SI
  1561  	ADDQ	$48, DI
  1562  	JMP	diff16
  1563  diff48:
  1564  	ADDQ	$32, SI
  1565  	ADDQ	$32, DI
  1566  	JMP	diff16
  1567  diff32:
  1568  	ADDQ	$16, SI
  1569  	ADDQ	$16, DI
  1570  	// AX = bit mask of differences
  1571  diff16:
  1572  	BSFQ	AX, BX	// index of first byte that differs
  1573  	XORQ	AX, AX
  1574  	MOVB	(SI)(BX*1), CX
  1575  	CMPB	CX, (DI)(BX*1)
  1576  	SETHI	AX	// 1 if a's byte is above b's (unsigned)
  1577  	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
  1578  	MOVQ	AX, (R9)
  1579  	RET
  1580  
  1581  	// 0 through 16 bytes left, alen>=8, blen>=8
  1582  _0through16:
  1583  	CMPQ	R8, $8
  1584  	JBE	_0through8
  1585  	MOVQ	(SI), AX
  1586  	MOVQ	(DI), CX
  1587  	CMPQ	AX, CX
  1588  	JNE	diff8
        	// Compare the 8 bytes ending at the end of the common prefix; this may
        	// overlap bytes that were already compared.
  1589  _0through8:
  1590  	MOVQ	-8(SI)(R8*1), AX
  1591  	MOVQ	-8(DI)(R8*1), CX
  1592  	CMPQ	AX, CX
  1593  	JEQ	allsame
  1594  
  1595  	// AX and CX contain parts of a and b that differ.
  1596  diff8:
  1597  	BSWAPQ	AX	// reverse order of bytes
  1598  	BSWAPQ	CX
  1599  	XORQ	AX, CX
  1600  	BSRQ	CX, CX	// index of highest bit difference
  1601  	SHRQ	CX, AX	// move a's bit to bottom
  1602  	ANDQ	$1, AX	// mask bit
  1603  	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
  1604  	MOVQ	AX, (R9)
  1605  	RET
  1606  
  1607  	// 0-7 bytes in common
  1608  small:
  1609  	LEAQ	(R8*8), CX	// bytes left -> bits left
  1610  	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
  1611  	JEQ	allsame	// no bytes in common: only the lengths matter
  1612  
  1613  	// load bytes of a into high bytes of SI
  1614  	CMPB	SI, $0xf8	// low address byte > 0xf8: load ending at the last byte instead
  1615  	JA	si_high
  1616  	MOVQ	(SI), SI
  1617  	JMP	si_finish
  1618  si_high:
  1619  	MOVQ	-8(SI)(R8*1), SI
  1620  	SHRQ	CX, SI
  1621  si_finish:
  1622  	SHLQ	CX, SI
  1623  
  1624  	// load bytes of b into high bytes of DI
  1625  	CMPB	DI, $0xf8
  1626  	JA	di_high
  1627  	MOVQ	(DI), DI
  1628  	JMP	di_finish
  1629  di_high:
  1630  	MOVQ	-8(DI)(R8*1), DI
  1631  	SHRQ	CX, DI
  1632  di_finish:
  1633  	SHLQ	CX, DI
  1634  
  1635  	BSWAPQ	SI	// reverse order of bytes
  1636  	BSWAPQ	DI
  1637  	XORQ	SI, DI	// find bit differences
  1638  	JEQ	allsame
  1639  	BSRQ	DI, CX	// index of highest bit difference
  1640  	SHRQ	CX, SI	// move a's bit to bottom
  1641  	ANDQ	$1, SI	// mask bit
  1642  	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
  1643  	MOVQ	AX, (R9)
  1644  	RET
  1645  
        	// Common bytes are identical: order by length. BX and DX still hold
        	// alen and blen on every path that jumps here.
  1646  allsame:
  1647  	XORQ	AX, AX
  1648  	XORQ	CX, CX
  1649  	CMPQ	BX, DX
  1650  	SETGT	AX	// 1 if alen > blen
  1651  	SETEQ	CX	// 1 if alen == blen
  1652  	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
  1653  	MOVQ	AX, (R9)
  1654  	RET
  1655  
  1656  	// this works for >= 64 bytes of data.
  1657  big_loop:
  1658  	MOVOU	(SI), X0
  1659  	MOVOU	(DI), X1
  1660  	PCMPEQB X0, X1
  1661  	PMOVMSKB X1, AX
  1662  	XORQ	$0xffff, AX
  1663  	JNE	diff16
  1664  
  1665  	MOVOU	16(SI), X0
  1666  	MOVOU	16(DI), X1
  1667  	PCMPEQB X0, X1
  1668  	PMOVMSKB X1, AX
  1669  	XORQ	$0xffff, AX
  1670  	JNE	diff32
  1671  
  1672  	MOVOU	32(SI), X0
  1673  	MOVOU	32(DI), X1
  1674  	PCMPEQB X0, X1
  1675  	PMOVMSKB X1, AX
  1676  	XORQ	$0xffff, AX
  1677  	JNE	diff48
  1678  
  1679  	MOVOU	48(SI), X0
  1680  	MOVOU	48(DI), X1
  1681  	PCMPEQB X0, X1
  1682  	PMOVMSKB X1, AX
  1683  	XORQ	$0xffff, AX
  1684  	JNE	diff64
  1685  
  1686  	ADDQ	$64, SI
  1687  	ADDQ	$64, DI
  1688  	SUBQ	$64, R8
  1689  	CMPQ	R8, $64
  1690  	JBE	loop	// 64 or fewer bytes left: finish in the 16-byte loop
  1691  	JMP	big_loop
  1692  
  1693  	// Compare 64-bytes per loop iteration.
  1694  	// Loop is unrolled and uses AVX2.
  1695  big_loop_avx2:
  1696  	VMOVDQU	(SI), Y2
  1697  	VMOVDQU	(DI), Y3
  1698  	VMOVDQU	32(SI), Y4
  1699  	VMOVDQU	32(DI), Y5
  1700  	VPCMPEQB Y2, Y3, Y0
  1701  	VPMOVMSKB Y0, AX
  1702  	XORL	$0xffffffff, AX	// AX = 32-bit mask of differing bytes
  1703  	JNE	diff32_avx2
  1704  	VPCMPEQB Y4, Y5, Y6
  1705  	VPMOVMSKB Y6, AX
  1706  	XORL	$0xffffffff, AX
  1707  	JNE	diff64_avx2
  1708  
  1709  	ADDQ	$64, SI
  1710  	ADDQ	$64, DI
  1711  	SUBQ	$64, R8
  1712  	CMPQ	R8, $64
  1713  	JB	big_loop_avx2_exit
  1714  	JMP	big_loop_avx2
  1715  
  1716  	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
        	// AX holds a 32-bit mask; diff16's BSFQ and indexed loads handle indexes 0-31.
  1717  diff32_avx2:
  1718  	VZEROUPPER
  1719  	JMP diff16
  1720  
  1721  	// Same as diff32_avx2, but for last 32 bytes.
        	// diff48 adds 32 to SI and DI before the shared diff16 code.
  1722  diff64_avx2:
  1723  	VZEROUPPER
  1724  	JMP diff48
  1725  
  1726  	// For <64 bytes remainder jump to normal loop.
  1727  big_loop_avx2_exit:
  1728  	VZEROUPPER
  1729  	JMP loop
  1730  
  1731  
  1732  TEXT strings·supportAVX2(SB),NOSPLIT,$0-1	// returns the byte flag runtime·support_avx2
  1733  	MOVBLZX runtime·support_avx2(SB), AX	// zero-extending byte load of the flag
  1734  	MOVB AX, ret+0(FP)
  1735  	RET
  1736  
  1737  TEXT bytes·supportAVX2(SB),NOSPLIT,$0-1	// returns the byte flag runtime·support_avx2
  1738  	MOVBLZX runtime·support_avx2(SB), AX	// zero-extending byte load of the flag
  1739  	MOVB AX, ret+0(FP)
  1740  	RET
  1741  
  1742  TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
  1743  	LEAQ ret+32(FP), R11	// the shared body stores the index through R11
  1744  	MOVQ s+0(FP), DI	// haystack pointer
  1745  	MOVQ DI, R10	// keep the start of s; the body subtracts it to form the index
  1746  	MOVQ c+16(FP), BP	// needle pointer
  1747  	// The two lengths must be in DX and AX: PCMPESTRI reads them implicitly.
  1748  	MOVQ c_len+24(FP), AX
  1749  	MOVQ s_len+8(FP), DX
  1750  	JMP  runtime·indexShortStr(SB)
  1751  
  1752  TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
  1753  	LEAQ ret+48(FP), R11	// the shared body stores the index through R11
  1754  	MOVQ s+0(FP), DI	// haystack pointer
  1755  	MOVQ DI, R10	// keep the start of s; the body subtracts it to form the index
  1756  	MOVQ c+24(FP), BP	// needle pointer
  1757  	MOVQ c_len+32(FP), AX	// needle length
  1758  	MOVQ s_len+8(FP), DX	// haystack length
  1759  	JMP  runtime·indexShortStr(SB)
  1760  
  1761  // AX: length of string, that we are searching for
  1762  // DX: length of string, in which we are searching
  1763  // DI: pointer to string, in which we are searching
  1764  // BP: pointer to string, that we are searching for
  1765  // R11: address, where to put return value
        // R10: start of the string being searched (the wrappers above copy DI into
        //      it); success subtracts it from the match address to get the index.
        // Stores the match index, or -1 if there is no match, to (R11).
        // Below, "sep" is the string searched for and "s" the string searched in.
        // The code dispatches on len(sep); each size class keeps sep (or its first
        // and last chunks) in registers and slides DI over s one byte at a time,
        // with DX rewritten to one past the last start position to try.
  1766  TEXT runtime·indexShortStr(SB),NOSPLIT,$0
  1767  	CMPQ AX, DX
  1768  	JA fail	// sep longer than s: no match possible
  1769  	CMPQ DX, $16
  1770  	JAE sse42	// s has at least 16 bytes: consider the PCMPESTRI path
  1771  no_sse42:
  1772  	CMPQ AX, $2
  1773  	JA   _3_or_more
  1774  	MOVW (BP), BP	// BP = the 2 bytes of sep
  1775  	LEAQ -1(DI)(DX*1), DX	// DX = s+len(s)-1, one past the last 2-byte window
  1776  loop2:
  1777  	MOVW (DI), SI
  1778  	CMPW SI,BP
  1779  	JZ success
  1780  	ADDQ $1,DI
  1781  	CMPQ DI,DX
  1782  	JB loop2
  1783  	JMP fail
  1784  _3_or_more:
  1785  	CMPQ AX, $3
  1786  	JA   _4_or_more
  1787  	MOVW 1(BP), BX	// BX = sep bytes 1-2
  1788  	MOVW (BP), BP	// BP = sep bytes 0-1
  1789  	LEAQ -2(DI)(DX*1), DX
  1790  loop3:
  1791  	MOVW (DI), SI
  1792  	CMPW SI,BP
  1793  	JZ   partial_success3
  1794  	ADDQ $1,DI
  1795  	CMPQ DI,DX
  1796  	JB loop3
  1797  	JMP fail
        	// first two bytes matched; check bytes 1-2
  1798  partial_success3:
  1799  	MOVW 1(DI), SI
  1800  	CMPW SI,BX
  1801  	JZ success
  1802  	ADDQ $1,DI
  1803  	CMPQ DI,DX
  1804  	JB loop3
  1805  	JMP fail
  1806  _4_or_more:
  1807  	CMPQ AX, $4
  1808  	JA   _5_or_more
  1809  	MOVL (BP), BP
  1810  	LEAQ -3(DI)(DX*1), DX
  1811  loop4:
  1812  	MOVL (DI), SI
  1813  	CMPL SI,BP
  1814  	JZ   success
  1815  	ADDQ $1,DI
  1816  	CMPQ DI,DX
  1817  	JB loop4
  1818  	JMP fail
  1819  _5_or_more:
  1820  	CMPQ AX, $7
  1821  	JA   _8_or_more
  1822  	LEAQ 1(DI)(DX*1), DX
  1823  	SUBQ AX, DX	// DX = s+len(s)-len(sep)+1, one past the last start position
  1824  	MOVL -4(BP)(AX*1), BX	// BX = last 4 bytes of sep
  1825  	MOVL (BP), BP	// BP = first 4 bytes of sep (the two overlap)
  1826  loop5to7:
  1827  	MOVL (DI), SI
  1828  	CMPL SI,BP
  1829  	JZ   partial_success5to7
  1830  	ADDQ $1,DI
  1831  	CMPQ DI,DX
  1832  	JB loop5to7
  1833  	JMP fail
  1834  partial_success5to7:
  1835  	MOVL -4(AX)(DI*1), SI	// last 4 bytes of the candidate, at DI+len(sep)-4
  1836  	CMPL SI,BX
  1837  	JZ success
  1838  	ADDQ $1,DI
  1839  	CMPQ DI,DX
  1840  	JB loop5to7
  1841  	JMP fail
  1842  _8_or_more:
  1843  	CMPQ AX, $8
  1844  	JA   _9_or_more
  1845  	MOVQ (BP), BP
  1846  	LEAQ -7(DI)(DX*1), DX
  1847  loop8:
  1848  	MOVQ (DI), SI
  1849  	CMPQ SI,BP
  1850  	JZ   success
  1851  	ADDQ $1,DI
  1852  	CMPQ DI,DX
  1853  	JB loop8
  1854  	JMP fail
  1855  _9_or_more:
  1856  	CMPQ AX, $15
  1857  	JA   _16_or_more
  1858  	LEAQ 1(DI)(DX*1), DX
  1859  	SUBQ AX, DX
  1860  	MOVQ -8(BP)(AX*1), BX	// BX = last 8 bytes of sep
  1861  	MOVQ (BP), BP	// BP = first 8 bytes of sep
  1862  loop9to15:
  1863  	MOVQ (DI), SI
  1864  	CMPQ SI,BP
  1865  	JZ   partial_success9to15
  1866  	ADDQ $1,DI
  1867  	CMPQ DI,DX
  1868  	JB loop9to15
  1869  	JMP fail
  1870  partial_success9to15:
  1871  	MOVQ -8(AX)(DI*1), SI
  1872  	CMPQ SI,BX
  1873  	JZ success
  1874  	ADDQ $1,DI
  1875  	CMPQ DI,DX
  1876  	JB loop9to15
  1877  	JMP fail
  1878  _16_or_more:
  1879  	CMPQ AX, $16
  1880  	JA   _17_or_more
  1881  	MOVOU (BP), X1
  1882  	LEAQ -15(DI)(DX*1), DX
  1883  loop16:
  1884  	MOVOU (DI), X2
  1885  	PCMPEQB X1, X2
  1886  	PMOVMSKB X2, SI
  1887  	CMPQ  SI, $0xffff	// all 16 bytes equal
  1888  	JE   success
  1889  	ADDQ $1,DI
  1890  	CMPQ DI,DX
  1891  	JB loop16
  1892  	JMP fail
  1893  _17_or_more:
  1894  	CMPQ AX, $31
  1895  	JA   _32_or_more
  1896  	LEAQ 1(DI)(DX*1), DX
  1897  	SUBQ AX, DX
  1898  	MOVOU -16(BP)(AX*1), X0	// X0 = last 16 bytes of sep
  1899  	MOVOU (BP), X1	// X1 = first 16 bytes of sep
  1900  loop17to31:
  1901  	MOVOU (DI), X2
  1902  	PCMPEQB X1,X2
  1903  	PMOVMSKB X2, SI
  1904  	CMPQ  SI, $0xffff
  1905  	JE   partial_success17to31
  1906  	ADDQ $1,DI
  1907  	CMPQ DI,DX
  1908  	JB loop17to31
  1909  	JMP fail
  1910  partial_success17to31:
  1911  	MOVOU -16(AX)(DI*1), X3
  1912  	PCMPEQB X0, X3
  1913  	PMOVMSKB X3, SI
  1914  	CMPQ  SI, $0xffff
  1915  	JE success
  1916  	ADDQ $1,DI
  1917  	CMPQ DI,DX
  1918  	JB loop17to31
  1919  	JMP fail
  1920  // We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
  1921  // So no need to check cpuid
  1922  _32_or_more:
  1923  	CMPQ AX, $32
  1924  	JA   _33_to_63
  1925  	VMOVDQU (BP), Y1
  1926  	LEAQ -31(DI)(DX*1), DX
  1927  loop32:
  1928  	VMOVDQU (DI), Y2
  1929  	VPCMPEQB Y1, Y2, Y3
  1930  	VPMOVMSKB Y3, SI
  1931  	CMPL  SI, $0xffffffff	// all 32 bytes equal
  1932  	JE   success_avx2
  1933  	ADDQ $1,DI
  1934  	CMPQ DI,DX
  1935  	JB loop32
  1936  	JMP fail_avx2
  1937  _33_to_63:
  1938  	LEAQ 1(DI)(DX*1), DX
  1939  	SUBQ AX, DX
  1940  	VMOVDQU -32(BP)(AX*1), Y0	// Y0 = last 32 bytes of sep
  1941  	VMOVDQU (BP), Y1	// Y1 = first 32 bytes of sep
  1942  loop33to63:
  1943  	VMOVDQU (DI), Y2
  1944  	VPCMPEQB Y1, Y2, Y3
  1945  	VPMOVMSKB Y3, SI
  1946  	CMPL  SI, $0xffffffff
  1947  	JE   partial_success33to63
  1948  	ADDQ $1,DI
  1949  	CMPQ DI,DX
  1950  	JB loop33to63
  1951  	JMP fail_avx2
  1952  partial_success33to63:
  1953  	VMOVDQU -32(AX)(DI*1), Y3
  1954  	VPCMPEQB Y0, Y3, Y4
  1955  	VPMOVMSKB Y4, SI
  1956  	CMPL  SI, $0xffffffff
  1957  	JE success_avx2
  1958  	ADDQ $1,DI
  1959  	CMPQ DI,DX
  1960  	JB loop33to63
        	// search exhausted: fall through into fail_avx2
  1961  fail_avx2:
  1962  	VZEROUPPER
  1963  fail:
  1964  	MOVQ $-1, (R11)
  1965  	RET
  1966  success_avx2:
  1967  	VZEROUPPER
  1968  	JMP success
  1969  sse42:
  1970  	MOVL runtime·cpuid_ecx(SB), CX
  1971  	ANDL $0x100000, CX	// bit 20 of the saved CPUID ECX flags (SSE4.2 feature flag)
  1972  	JZ no_sse42
  1973  	CMPQ AX, $12
  1974  	// PCMPESTRI is slower than normal compare,
  1975  	// so using it makes sense only if we advance 4+ bytes per compare
  1976  	// This value was determined experimentally and is the ~same
  1977  	// on Nehalem (first with SSE42) and Haswell.
  1978  	JAE _9_or_more	// len(sep) >= 12: use the plain compares (_9_or_more re-dispatches on AX)
  1979  	LEAQ 16(BP), SI
  1980  	TESTW $0xff0, SI	// bits 4-11 of sep+16 clear: skip the 16-byte load of sep below
  1981  	JEQ no_sse42
  1982  	MOVOU (BP), X1
  1983  	LEAQ -15(DI)(DX*1), SI	// SI = s+len(s)-15; SI-1 is the start of the last 16 bytes of s
  1984  	MOVQ $16, R9
  1985  	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
  1986  loop_sse42:
  1987  	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
  1988  	// for equality (bits 2,3 are 11)
  1989  	// result is not masked or inverted (bits 4,5 are 00)
  1990  	// and corresponds to first matching byte (bit 6 is 0)
  1991  	PCMPESTRI $0x0c, (DI), X1
  1992  	// CX == 16 means no match,
  1993  	// CX > R9 means partial match at the end of the string,
  1994  	// otherwise sep is at offset CX from X1 start
  1995  	CMPQ CX, R9
  1996  	JBE sse42_success
  1997  	ADDQ R9, DI
  1998  	CMPQ DI, SI
  1999  	JB loop_sse42
        	// final window: the last 16 bytes of s
  2000  	PCMPESTRI $0x0c, -1(SI), X1
  2001  	CMPQ CX, R9
  2002  	JA fail
  2003  	LEAQ -1(SI), DI
  2004  sse42_success:
  2005  	ADDQ CX, DI	// DI = address of the match
  2006  success:
  2007  	SUBQ R10, DI	// convert the match address into an index from the start of s
  2008  	MOVQ DI, (R11)
  2009  	RET
  2010  
  2011  
  2012  TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
  2013  	LEAQ ret+32(FP), R8	// indexbytebody stores the index through R8
  2014  	MOVB c+24(FP), AL	// byte to look for
  2015  	MOVQ s_len+8(FP), BX	// number of bytes to search
  2016  	MOVQ s+0(FP), SI	// start of data
  2017  	JMP  runtime·indexbytebody(SB)
  2018  
  2019  TEXT strings·IndexByte(SB),NOSPLIT,$0-32
  2020  	LEAQ ret+24(FP), R8	// indexbytebody stores the index through R8
  2021  	MOVB c+16(FP), AL	// byte to look for
  2022  	MOVQ s_len+8(FP), BX	// number of bytes to search
  2023  	MOVQ s+0(FP), SI	// start of data
  2024  	JMP  runtime·indexbytebody(SB)
  2025  
  2026  // input:
  2027  //   SI: data
  2028  //   BX: data len
  2029  //   AL: byte sought
  2030  //   R8: address to put result
        // Stores the index of the first occurrence of AL in the BX bytes at SI, or -1
        // if there is none, to (R8). Lengths below 16 use one masked 16-byte load;
        // lengths above 32 use the AVX2 loop when runtime·support_avx2 is 1; everything
        // else uses the 16-byte SSE loop.
  2031  TEXT runtime·indexbytebody(SB),NOSPLIT,$0
  2032  	// Shuffle X0 around so that each byte contains
  2033  	// the character we're looking for.
  2034  	MOVD AX, X0
  2035  	PUNPCKLBW X0, X0
  2036  	PUNPCKLBW X0, X0
  2037  	PSHUFL $0, X0, X0
  2038  	
  2039  	CMPQ BX, $16
  2040  	JLT small
  2041  
  2042  	MOVQ SI, DI	// DI = scan cursor; SI keeps the start for the index computation
  2043  
  2044  	CMPQ BX, $32
  2045  	JA avx2
  2046  sse:
        	// AX is overwritten from here on; the sought byte already lives in X0.
  2047  	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
  2048  	JMP	sseloopentry
  2049  	
  2050  sseloop:
  2051  	// Move the next 16-byte chunk of the data into X1.
  2052  	MOVOU	(DI), X1
  2053  	// Compare bytes in X0 to X1.
  2054  	PCMPEQB	X0, X1
  2055  	// Take the top bit of each byte in X1 and put the result in DX.
  2056  	PMOVMSKB X1, DX
  2057  	// Find first set bit, if any.
  2058  	BSFL	DX, DX
  2059  	JNZ	ssesuccess
  2060  	// Advance to next block.
  2061  	ADDQ	$16, DI
  2062  sseloopentry:
  2063  	CMPQ	DI, AX
  2064  	JB	sseloop
  2065  
  2066  	// Search the last 16-byte chunk. This chunk may overlap with the
  2067  	// chunks we've already searched, but that's ok.
  2068  	MOVQ	AX, DI
  2069  	MOVOU	(AX), X1
  2070  	PCMPEQB	X0, X1
  2071  	PMOVMSKB X1, DX
  2072  	BSFL	DX, DX
  2073  	JNZ	ssesuccess
  2074  
  2075  failure:
  2076  	MOVQ $-1, (R8)
  2077  	RET
  2078  
  2079  // We've found a chunk containing the byte.
  2080  // The chunk was loaded from DI.
  2081  // The index of the matching byte in the chunk is DX.
  2082  // The start of the data is SI.
  2083  ssesuccess:
  2084  	SUBQ SI, DI	// Compute offset of chunk within data.
  2085  	ADDQ DX, DI	// Add offset of byte within chunk.
  2086  	MOVQ DI, (R8)
  2087  	RET
  2088  
  2089  // handle for lengths < 16
  2090  small:
  2091  	TESTQ	BX, BX
  2092  	JEQ	failure
  2093  
  2094  	// Check if we'll load across a page boundary.
  2095  	LEAQ	16(SI), AX
  2096  	TESTW	$0xff0, AX	// bits 4-11 of SI+16 clear <=> low 12 bits of SI are 0xff0-0xfff
  2097  	JEQ	endofpage
  2098  
  2099  	MOVOU	(SI), X1 // Load data
  2100  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  2101  	PMOVMSKB X1, DX	// Move result bits to integer register.
  2102  	BSFL	DX, DX	// Find first set bit.
  2103  	JZ	failure	// No set bit, failure.
  2104  	CMPL	DX, BX
  2105  	JAE	failure	// Match is past end of data.
  2106  	MOVQ	DX, (R8)
  2107  	RET
  2108  
  2109  endofpage:
  2110  	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
  2111  	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
  2112  	PMOVMSKB X1, DX	// Move result bits to integer register.
  2113  	MOVL	BX, CX
  2114  	SHLL	CX, DX	// shift left by len, so the data's bits occupy bits 16 and up
  2115  	SHRL	$16, DX	// Shift desired bits down to bottom of register.
  2116  	BSFL	DX, DX	// Find first set bit.
  2117  	JZ	failure	// No set bit, failure.
  2118  	MOVQ	DX, (R8)
  2119  	RET
  2120  
        // reached only when len > 32
  2121  avx2:
  2122  	CMPB   runtime·support_avx2(SB), $1
  2123  	JNE sse	// AVX2 flag not set: use the SSE loop
  2124  	MOVD AX, X0	// AX still holds the sought byte here (sse: is what overwrites it)
  2125  	LEAQ -32(SI)(BX*1), R11	// R11 = address of last 32 bytes
  2126  	VPBROADCASTB  X0, Y1
  2127  avx2_loop:
  2128  	VMOVDQU (DI), Y2
  2129  	VPCMPEQB Y1, Y2, Y3
  2130  	VPTEST Y3, Y3
  2131  	JNZ avx2success
  2132  	ADDQ $32, DI
  2133  	CMPQ DI, R11
        	// NOTE(review): JLT is a signed compare on addresses, while sseloopentry
        	// uses the unsigned JB; confirm whether JB was intended here.
  2134  	JLT avx2_loop
        	// Search the last 32-byte chunk; it may overlap chunks already searched.
  2135  	MOVQ R11, DI
  2136  	VMOVDQU (DI), Y2
  2137  	VPCMPEQB Y1, Y2, Y3
  2138  	VPTEST Y3, Y3
  2139  	JNZ avx2success
  2140  	VZEROUPPER
  2141  	MOVQ $-1, (R8)
  2142  	RET
  2143  
        // Y3 holds the compare result for the 32-byte chunk loaded from DI.
  2144  avx2success:
  2145  	VPMOVMSKB Y3, DX
  2146  	BSFL DX, DX	// index of the first matching byte within the chunk
  2147  	SUBQ SI, DI	// offset of the chunk within the data
  2148  	ADDQ DI, DX
  2149  	MOVQ DX, (R8)
  2150  	VZEROUPPER
  2151  	RET
  2152  
  2153  TEXT bytes·Equal(SB),NOSPLIT,$0-49	// stores 0 in ret when the lengths of a and b differ; otherwise tail-calls runtime·memeqbody to produce the result
  2154  	MOVQ	a_len+8(FP), BX	// BX = len(a)
  2155  	MOVQ	b_len+32(FP), CX	// CX = len(b)
  2156  	CMPQ	BX, CX
  2157  	JNE	eqret	// different lengths: result is 0 without looking at the data
  2158  	MOVQ	a+0(FP), SI	// SI = data pointer of a
  2159  	MOVQ	b+24(FP), DI	// DI = data pointer of b
  2160  	LEAQ	ret+48(FP), AX	// AX = address of the result byte
  2161  	JMP	runtime·memeqbody(SB)	// tail call with SI, DI, BX (common length) and AX set; memeqbody is defined outside this chunk - presumably it writes the result through AX
  2162  eqret:
  2163  	MOVB	$0, ret+48(FP)
  2164  	RET
  2165  
  2166  TEXT runtime·fastrand(SB), NOSPLIT, $0-4	// advances the m_fastrand state of the current g's m by one step and returns the new 32-bit value
  2167  	get_tls(CX)
  2168  	MOVQ	g(CX), AX	// AX = current g (get_tls and g are macros defined outside this chunk, presumably in go_tls.h)
  2169  	MOVQ	g_m(AX), AX	// AX = that g's m
  2170  	MOVL	m_fastrand(AX), DX	// DX = x, the previous state
  2171  	ADDL	DX, DX	// DX = 2*x (32-bit, wrapping)
  2172  	MOVL	DX, BX	// BX = 2*x, kept as the un-XORed candidate
  2173  	XORL	$0x88888eef, DX	// DX = 2*x ^ 0x88888eef; the constant has bit 31 set, so the sign flag is now the inverse of bit 31 of 2*x
  2174  	CMOVLMI	BX, DX	// sign set => 2*x had bit 31 clear => take the plain 2*x; otherwise keep the XORed value
  2175  	MOVL	DX, m_fastrand(AX)	// store the new state
  2176  	MOVL	DX, ret+0(FP)	// and return it
  2177  	RET
  2178  
  2179  TEXT runtime·return0(SB), NOSPLIT, $0	// sets AX to 0 and returns; takes no stack arguments
  2180  	MOVL	$0, AX	// 32-bit write, which also clears the upper half of the 64-bit register
  2181  	RET
  2182  
  2183  
  2184  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  2185  // Must obey the gcc calling convention: the result is left in AX; besides AX only CX is written.
  2186  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  2187  	get_tls(CX)
  2188  	MOVQ	g(CX), AX	// AX = current g
  2189  	MOVQ	g_m(AX), AX	// AX = g->m
  2190  	MOVQ	m_curg(AX), AX	// AX = g->m->curg
  2191  	MOVQ	(g_stack+stack_hi)(AX), AX	// AX = curg.stack.hi, the return value
  2192  	RET
  2193  
  2194  // The top-most function running on a goroutine
  2195  // returns to goexit+PCQuantum, so execution enters here past the function's first byte.
  2196  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  2197  	BYTE	$0x90	// NOP; presumably the padding that makes goexit+PCQuantum land on the CALL - confirm PCQuantum for amd64
  2198  	CALL	runtime·goexit1(SB)	// does not return
  2199  	// traceback from goexit1 must hit code range of goexit
  2200  	BYTE	$0x90	// NOP; keeps the CALL's return address inside goexit's code range
  2201  
  2202  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8	// issues a PREFETCHT0 hint for the memory at addr; no result
  2203  	MOVQ	addr+0(FP), AX	// AX = addr argument
  2204  	PREFETCHT0	(AX)	// prefetch hint only; no register or memory result is produced here
  2205  	RET
  2206  
  2207  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8	// issues a PREFETCHT1 hint for the memory at addr; no result
  2208  	MOVQ	addr+0(FP), AX	// AX = addr argument
  2209  	PREFETCHT1	(AX)	// prefetch hint only; no register or memory result is produced here
  2210  	RET
  2211  
  2212  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8	// issues a PREFETCHT2 hint for the memory at addr; no result
  2213  	MOVQ	addr+0(FP), AX	// AX = addr argument
  2214  	PREFETCHT2	(AX)	// prefetch hint only; no register or memory result is produced here
  2215  	RET
  2216  
  2217  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8	// issues a PREFETCHNTA hint for the memory at addr; no result
  2218  	MOVQ	addr+0(FP), AX	// AX = addr argument
  2219  	PREFETCHNTA	(AX)	// prefetch hint only; no register or memory result is produced here
  2220  	RET
  2221  
  2222  // This is called from .init_array and follows the platform, not Go, ABI. It links the value in DI after the current last moduledata and makes it the new last one.
  2223  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  2224  	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
  2225  	MOVQ	runtime·lastmoduledatap(SB), AX	// AX = current last moduledata
  2226  	MOVQ	DI, moduledata_next(AX)	// old last's next field = DI (presumably the new moduledata pointer, passed as the first platform-ABI argument)
  2227  	MOVQ	DI, runtime·lastmoduledatap(SB)	// DI becomes the new last moduledata
  2228  	POPQ	R15	// restore the caller's R15
  2229  	RET