github.com/emc-advanced-dev/unik@v0.0.0-20190717152701-a58d3e8e33b7/containers/compilers/rump/go/gopatches/runtime/asm_amd64.s

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  
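         // ksyscall is the unik/rump-specific patch in this file: it brackets
         // a rump kernel syscall with entersyscall/exitsyscall and runs the
         // call on m->g0's stack, mirroring runtime·systemstack below (the
         // gsignal and curg loads at the top are carried over from that
         // routine and are otherwise unused here).  The frame is $0-56, so
         // 8(SP) is fn+0(FP); the arguments are computed from ksyscall's own
         // frame before the stack switch, and the call issued from the g0
         // stack is, in the SysV argument registers, roughly
         //
         //	rump_syscall(fn, SP+16 /* the args after fn */, 0 /* dlen */,
         //		SP+40 /* presumably a result pointer */)
         //
         // The errno returned in AX is stored at 56(SP) for the Go caller.
         // The rump_syscall prototype above is inferred from this register
         // setup; it is not stated anywhere in the file.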
    11  TEXT runtime·ksyscall(SB),NOSPLIT,$0-56
    12      CALL	runtime·entersyscall(SB)
    13  
    14      MOVQ	fn+0(FP), DI	// DI = fn
    15      get_tls(CX)
    16      MOVQ	g(CX), AX	// AX = g
    17      MOVQ	g_m(AX), BX	// BX = m
    18      MOVQ	m_gsignal(BX), DX	// DX = gsignal
    19      MOVQ	m_g0(BX), DX	// DX = g0
    20      MOVQ	m_curg(BX), R8
    21  
    22      // save our state in g->sched.  Pretend to
    23      // be systemstack_switch if the G stack is scanned.
    24      MOVQ	$runtime·systemstack_switch(SB), SI
    25      MOVQ	SI, (g_sched+gobuf_pc)(AX)
    26      MOVQ	SP, (g_sched+gobuf_sp)(AX)
    27      MOVQ	AX, (g_sched+gobuf_g)(AX)
    28      MOVQ	BP, (g_sched+gobuf_bp)(AX)
    29  
    30      // switch to g0
    31      MOVQ	DX, g(CX)
    32      MOVQ	(g_sched+gobuf_sp)(DX), BX
    33      // make it look like mstart called systemstack on g0, to stop traceback
    34      SUBQ	$8, BX
    35      MOVQ	$runtime·mstart(SB), DX
    36      MOVQ	DX, 0(BX)
    37  
    38      // call target function
    39      MOVQ	8(SP), DI
    40      MOVQ	SP, SI
    41      ADDQ	$16, SI
    42      MOVQ	$0, DX		// dlen is ignored for local calls
    43      MOVQ	SP, CX
    44      ADDQ	$40, CX
    45  
    46      MOVQ	BX, SP // <- change stack just before call.
    47      LEAQ	rump_syscall(SB), AX
    48      CALL	AX
    49      MOVQ	AX, DX	// errno
    50  
    51      // SWITCH BACK
    52      get_tls(CX)
    53      MOVQ	g(CX), AX
    54      MOVQ	g_m(AX), BX
    55      MOVQ	m_curg(BX), AX
    56      MOVQ	AX, g(CX)
    57      MOVQ	(g_sched+gobuf_sp)(AX), SP
    58      MOVQ	$0, (g_sched+gobuf_sp)(AX)
    59  
    60      MOVQ	DX, 56(SP)	// errno
    61  
    62  
    63      CALL	runtime·exitsyscall(SB)
    64      RET
    65  
    66  
    67  
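         // rt0_go is the process entry point: it copies argc/argv, carves
         // g0's initial stack bounds out of the OS stack, probes CPUID, runs
         // _cgo_init if present, sets up TLS, wires m0 and g0 together, then
         // calls args/osinit/schedinit, queues runtime·main with newproc and
         // enters the scheduler via mstart, which does not return.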
    68  TEXT runtime·rt0_go(SB),NOSPLIT,$0
    69  	// copy arguments forward on an even stack
    70  	MOVQ	DI, AX		// argc
    71  	MOVQ	SI, BX		// argv
    72  	SUBQ	$(4*8+7), SP		// 2args 2auto
    73  	ANDQ	$~15, SP
    74  	MOVQ	AX, 16(SP)
    75  	MOVQ	BX, 24(SP)
    76  	
    77  	// create istack out of the given (operating system) stack.
    78  	// _cgo_init may update stackguard.
    79  	MOVQ	$runtime·g0(SB), DI
    80  	LEAQ	(-64*1024+104)(SP), BX
    81  	MOVQ	BX, g_stackguard0(DI)
    82  	MOVQ	BX, g_stackguard1(DI)
    83  	MOVQ	BX, (g_stack+stack_lo)(DI)
    84  	MOVQ	SP, (g_stack+stack_hi)(DI)
    85  
    86  	// find out information about the processor we're on
    87  	MOVQ	$0, AX
    88  	CPUID
    89  	CMPQ	AX, $0
    90  	JE	nocpuinfo
    91  
    92  	// Figure out how to serialize RDTSC.
    93  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    94  	// Don't know about the rest, so let's do MFENCE.
    95  	CMPL	BX, $0x756E6547  // "Genu"
    96  	JNE	notintel
    97  	CMPL	DX, $0x49656E69  // "ineI"
    98  	JNE	notintel
    99  	CMPL	CX, $0x6C65746E  // "ntel"
   100  	JNE	notintel
   101  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
   102  notintel:
   103  
   104  	MOVQ	$1, AX
   105  	CPUID
   106  	MOVL	CX, runtime·cpuid_ecx(SB)
   107  	MOVL	DX, runtime·cpuid_edx(SB)
   108  nocpuinfo:	
   109  	
   110  	// if there is an _cgo_init, call it.
   111  	MOVQ	_cgo_init(SB), AX
   112  	TESTQ	AX, AX
   113  	JZ	needtls
   114  	// g0 already in DI
   115  	MOVQ	DI, CX	// Win64 uses CX for first parameter
   116  	MOVQ	$setg_gcc<>(SB), SI
   117  	CALL	AX
   118  
   119  	// update stackguard after _cgo_init
   120  	MOVQ	$runtime·g0(SB), CX
   121  	MOVQ	(g_stack+stack_lo)(CX), AX
   122  	ADDQ	$const__StackGuard, AX
   123  	MOVQ	AX, g_stackguard0(CX)
   124  	MOVQ	AX, g_stackguard1(CX)
   125  
   126  	CMPL	runtime·iswindows(SB), $0
   127  	JEQ ok
   128  needtls:
   129  	// skip TLS setup on Plan 9
   130  	CMPL	runtime·isplan9(SB), $1
   131  	JEQ ok
   132  	// skip TLS setup on Solaris
   133  	CMPL	runtime·issolaris(SB), $1
   134  	JEQ ok
   135  
   136  	LEAQ	runtime·tls0(SB), DI
   137  	CALL	runtime·settls(SB)
   138  
   139  	// store through it, to make sure it works
   140  	get_tls(BX)
   141  	MOVQ	$0x123, g(BX)
   142  	MOVQ	runtime·tls0(SB), AX
   143  	CMPQ	AX, $0x123
   144  	JEQ 2(PC)
   145  	MOVL	AX, 0	// abort
   146  ok:
   147  	// set the per-goroutine and per-mach "registers"
   148  	get_tls(BX)
   149  	LEAQ	runtime·g0(SB), CX
   150  	MOVQ	CX, g(BX)
   151  	LEAQ	runtime·m0(SB), AX
   152  
   153  	// save m->g0 = g0
   154  	MOVQ	CX, m_g0(AX)
   155  	// save m0 to g0->m
   156  	MOVQ	AX, g_m(CX)
   157  
   158  	CLD				// convention is D is always left cleared
   159  	CALL	runtime·check(SB)
   160  
   161  	MOVL	16(SP), AX		// copy argc
   162  	MOVL	AX, 0(SP)
   163  	MOVQ	24(SP), AX		// copy argv
   164  	MOVQ	AX, 8(SP)
   165  	CALL	runtime·args(SB)
   166  	CALL	runtime·osinit(SB)
   167  	CALL	runtime·schedinit(SB)
   168  
   169  	// create a new goroutine to start program
   170  	MOVQ	$runtime·mainPC(SB), AX		// entry
   171  	PUSHQ	AX
   172  	PUSHQ	$0			// arg size
   173  	CALL	runtime·newproc(SB)
   174  	POPQ	AX
   175  	POPQ	AX
   176  
   177  	// start this M
   178  	CALL	runtime·mstart(SB)
   179  
   180  	MOVL	$0xf1, 0xf1  // crash
   181  	RET
   182  
   183  DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
   184  GLOBL	runtime·mainPC(SB),RODATA,$8
   185  
   186  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
   187  	BYTE	$0xcc
   188  	RET
   189  
   190  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   191  	// No per-thread init.
   192  	RET
   193  
   194  /*
   195   *  go-routine
   196   */
   197  
   198  // void gosave(Gobuf*)
   199  // save state in Gobuf; setjmp
   200  TEXT runtime·gosave(SB), NOSPLIT, $0-8
   201  	MOVQ	buf+0(FP), AX		// gobuf
   202  	LEAQ	buf+0(FP), BX		// caller's SP
   203  	MOVQ	BX, gobuf_sp(AX)
   204  	MOVQ	0(SP), BX		// caller's PC
   205  	MOVQ	BX, gobuf_pc(AX)
   206  	MOVQ	$0, gobuf_ret(AX)
   207  	MOVQ	$0, gobuf_ctxt(AX)
   208  	MOVQ	BP, gobuf_bp(AX)
   209  	get_tls(CX)
   210  	MOVQ	g(CX), BX
   211  	MOVQ	BX, gobuf_g(AX)
   212  	RET
   213  
   214  // void gogo(Gobuf*)
   215  // restore state from Gobuf; longjmp
   216  TEXT runtime·gogo(SB), NOSPLIT, $0-8
   217  	MOVQ	buf+0(FP), BX		// gobuf
   218  	MOVQ	gobuf_g(BX), DX
   219  	MOVQ	0(DX), CX		// make sure g != nil
   220  	get_tls(CX)
   221  	MOVQ	DX, g(CX)
   222  	MOVQ	gobuf_sp(BX), SP	// restore SP
   223  	MOVQ	gobuf_ret(BX), AX
   224  	MOVQ	gobuf_ctxt(BX), DX
   225  	MOVQ	gobuf_bp(BX), BP
   226  	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   227  	MOVQ	$0, gobuf_ret(BX)
   228  	MOVQ	$0, gobuf_ctxt(BX)
   229  	MOVQ	$0, gobuf_bp(BX)
   230  	MOVQ	gobuf_pc(BX), BX
   231  	JMP	BX
   232  
   233  // func mcall(fn func(*g))
   234  // Switch to m->g0's stack, call fn(g).
   235  // Fn must never return.  It should gogo(&g->sched)
   236  // to keep running g.
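         // In Go terms the contract looks like this (sketch; the fn body is
         // illustrative):
         //	mcall(func(gp *g) {
         //		// now running on g0's stack; must not return.
         //		// Either hand gp to the scheduler or resume it:
         //		gogo(&gp.sched)
         //	})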
   237  TEXT runtime·mcall(SB), NOSPLIT, $0-8
   238  	MOVQ	fn+0(FP), DI
   239  	
   240  	get_tls(CX)
   241  	MOVQ	g(CX), AX	// save state in g->sched
   242  	MOVQ	0(SP), BX	// caller's PC
   243  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   244  	LEAQ	fn+0(FP), BX	// caller's SP
   245  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   246  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   247  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   248  
   249  	// switch to m->g0 & its stack, call fn
   250  	MOVQ	g(CX), BX
   251  	MOVQ	g_m(BX), BX
   252  	MOVQ	m_g0(BX), SI
   253  	CMPQ	SI, AX	// if g == m->g0 call badmcall
   254  	JNE	3(PC)
   255  	MOVQ	$runtime·badmcall(SB), AX
   256  	JMP	AX
   257  	MOVQ	SI, g(CX)	// g = m->g0
   258  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   259  	PUSHQ	AX
   260  	MOVQ	DI, DX
   261  	MOVQ	0(DI), DI
   262  	CALL	DI
   263  	POPQ	AX
   264  	MOVQ	$runtime·badmcall2(SB), AX
   265  	JMP	AX
   266  	RET
   267  
   268  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   269  // of the G stack.  We need to distinguish the routine that
   270  // lives at the bottom of the G stack from the one that lives
   271  // at the top of the system stack because the one at the top of
   272  // the system stack terminates the stack walk (see topofstack()).
   273  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   274  	RET
   275  
   276  // func systemstack(fn func())
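         // Typical Go-level use (sketch):
         //	systemstack(func() {
         //		// runs on the g0 (scheduler) stack, e.g. to avoid growing
         //		// the user goroutine's stack; execution then switches back.
         //	})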
   277  TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   278  	MOVQ	fn+0(FP), DI	// DI = fn
   279  	get_tls(CX)
   280  	MOVQ	g(CX), AX	// AX = g
   281  	MOVQ	g_m(AX), BX	// BX = m
   282  
   283  	MOVQ	m_gsignal(BX), DX	// DX = gsignal
   284  	CMPQ	AX, DX
   285  	JEQ	noswitch
   286  
   287  	MOVQ	m_g0(BX), DX	// DX = g0
   288  	CMPQ	AX, DX
   289  	JEQ	noswitch
   290  
   291  	MOVQ	m_curg(BX), R8
   292  	CMPQ	AX, R8
   293  	JEQ	switch
   294  
   295  	// Bad: g is not gsignal, not g0, not curg. What is it?
   296  	MOVQ	$runtime·badsystemstack(SB), AX
   297  	CALL	AX
   298  
   299  switch:
   300  	// save our state in g->sched.  Pretend to
   301  	// be systemstack_switch if the G stack is scanned.
   302  	MOVQ	$runtime·systemstack_switch(SB), SI
   303  	MOVQ	SI, (g_sched+gobuf_pc)(AX)
   304  	MOVQ	SP, (g_sched+gobuf_sp)(AX)
   305  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   306  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   307  
   308  	// switch to g0
   309  	MOVQ	DX, g(CX)
   310  	MOVQ	(g_sched+gobuf_sp)(DX), BX
   311  	// make it look like mstart called systemstack on g0, to stop traceback
   312  	SUBQ	$8, BX
   313  	MOVQ	$runtime·mstart(SB), DX
   314  	MOVQ	DX, 0(BX)
   315  	MOVQ	BX, SP
   316  
   317  	// call target function
   318  	MOVQ	DI, DX
   319  	MOVQ	0(DI), DI
   320  	CALL	DI
   321  
   322  	// switch back to g
   323  	get_tls(CX)
   324  	MOVQ	g(CX), AX
   325  	MOVQ	g_m(AX), BX
   326  	MOVQ	m_curg(BX), AX
   327  	MOVQ	AX, g(CX)
   328  	MOVQ	(g_sched+gobuf_sp)(AX), SP
   329  	MOVQ	$0, (g_sched+gobuf_sp)(AX)
   330  	RET
   331  
   332  noswitch:
   333  	// already on m stack, just call directly
   334  	MOVQ	DI, DX
   335  	MOVQ	0(DI), DI
   336  	CALL	DI
   337  	RET
   338  
   339  /*
   340   * support for morestack
   341   */
   342  
   343  // Called during function prolog when more stack is needed.
   344  //
   345  // The traceback routines see morestack on a g0 as being
   346  // the top of a stack (for example, morestack calling newstack
   347  // calling the scheduler calling newm calling gc), so we must
   348  // record an argument size. For that purpose, it has no arguments.
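         // Flow: the function prologue reaches here when SP dips below
         // stackguard0; we record f's caller in m->morebuf and f's context in
         // g->sched, switch to m->g0's stack and call newstack, which grows
         // or copies the stack and restarts f, so morestack itself never
         // returns (hence the crash store after the CALL below).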
   349  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   350  	// Cannot grow scheduler stack (m->g0).
   351  	get_tls(CX)
   352  	MOVQ	g(CX), BX
   353  	MOVQ	g_m(BX), BX
   354  	MOVQ	m_g0(BX), SI
   355  	CMPQ	g(CX), SI
   356  	JNE	2(PC)
   357  	INT	$3
   358  
   359  	// Cannot grow signal stack (m->gsignal).
   360  	MOVQ	m_gsignal(BX), SI
   361  	CMPQ	g(CX), SI
   362  	JNE	2(PC)
   363  	INT	$3
   364  
   365  	// Called from f.
   366  	// Set m->morebuf to f's caller.
   367  	MOVQ	8(SP), AX	// f's caller's PC
   368  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   369  	LEAQ	16(SP), AX	// f's caller's SP
   370  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   371  	get_tls(CX)
   372  	MOVQ	g(CX), SI
   373  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   374  
   375  	// Set g->sched to context in f.
   376  	MOVQ	0(SP), AX // f's PC
   377  	MOVQ	AX, (g_sched+gobuf_pc)(SI)
   378  	MOVQ	SI, (g_sched+gobuf_g)(SI)
   379  	LEAQ	8(SP), AX // f's SP
   380  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   381  	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
   382  	MOVQ	BP, (g_sched+gobuf_bp)(SI)
   383  
   384  	// Call newstack on m->g0's stack.
   385  	MOVQ	m_g0(BX), BX
   386  	MOVQ	BX, g(CX)
   387  	MOVQ	(g_sched+gobuf_sp)(BX), SP
   388  	CALL	runtime·newstack(SB)
   389  	MOVQ	$0, 0x1003	// crash if newstack returns
   390  	RET
   391  
   392  // morestack but not preserving ctxt.
   393  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   394  	MOVL	$0, DX
   395  	JMP	runtime·morestack(SB)
   396  
   397  TEXT runtime·stackBarrier(SB),NOSPLIT,$0
   398  	// We came here via a RET to an overwritten return PC.
   399  	// AX may be live. Other registers are available.
   400  
   401  	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   402  	get_tls(CX)
   403  	MOVQ	g(CX), CX
   404  	MOVQ	(g_stkbar+slice_array)(CX), DX
   405  	MOVQ	g_stkbarPos(CX), BX
   406  	IMULQ	$stkbar__size, BX	// Too big for SIB.
   407  	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
   408  	// Record that this stack barrier was hit.
   409  	ADDQ	$1, g_stkbarPos(CX)
   410  	// Jump to the original return PC.
   411  	JMP	BX
   412  
   413  // reflectcall: call a function with the given argument list
   414  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   415  // we don't have variable-sized frames, so we use a small number
   416  // of constant-sized-frame functions to encode a few bits of size in the pc.
   417  // Caution: ugly multiline assembly macros in your future!
   418  
   419  #define DISPATCH(NAME,MAXSIZE)		\
   420  	CMPQ	CX, $MAXSIZE;		\
   421  	JA	3(PC);			\
   422  	MOVQ	$NAME(SB), AX;		\
   423  	JMP	AX
   424  // Note: can't just "JMP NAME(SB)" - bad inlining results.
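         // Example: argsize = 100 fails the $32 and $64 comparisons but not
         // $128, so control jumps to runtime·call128, whose 128-byte frame is
         // large enough to hold the copied arguments and results.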
   425  
   426  TEXT reflect·call(SB), NOSPLIT, $0-0
   427  	JMP	·reflectcall(SB)
   428  
   429  TEXT ·reflectcall(SB), NOSPLIT, $0-32
   430  	MOVLQZX argsize+24(FP), CX
   431  	// NOTE(rsc): No call16, because CALLFN needs four words
   432  	// of argument space to invoke callwritebarrier.
   433  	DISPATCH(runtime·call32, 32)
   434  	DISPATCH(runtime·call64, 64)
   435  	DISPATCH(runtime·call128, 128)
   436  	DISPATCH(runtime·call256, 256)
   437  	DISPATCH(runtime·call512, 512)
   438  	DISPATCH(runtime·call1024, 1024)
   439  	DISPATCH(runtime·call2048, 2048)
   440  	DISPATCH(runtime·call4096, 4096)
   441  	DISPATCH(runtime·call8192, 8192)
   442  	DISPATCH(runtime·call16384, 16384)
   443  	DISPATCH(runtime·call32768, 32768)
   444  	DISPATCH(runtime·call65536, 65536)
   445  	DISPATCH(runtime·call131072, 131072)
   446  	DISPATCH(runtime·call262144, 262144)
   447  	DISPATCH(runtime·call524288, 524288)
   448  	DISPATCH(runtime·call1048576, 1048576)
   449  	DISPATCH(runtime·call2097152, 2097152)
   450  	DISPATCH(runtime·call4194304, 4194304)
   451  	DISPATCH(runtime·call8388608, 8388608)
   452  	DISPATCH(runtime·call16777216, 16777216)
   453  	DISPATCH(runtime·call33554432, 33554432)
   454  	DISPATCH(runtime·call67108864, 67108864)
   455  	DISPATCH(runtime·call134217728, 134217728)
   456  	DISPATCH(runtime·call268435456, 268435456)
   457  	DISPATCH(runtime·call536870912, 536870912)
   458  	DISPATCH(runtime·call1073741824, 1073741824)
   459  	MOVQ	$runtime·badreflectcall(SB), AX
   460  	JMP	AX
   461  
   462  #define CALLFN(NAME,MAXSIZE)			\
   463  TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   464  	NO_LOCAL_POINTERS;			\
   465  	/* copy arguments to stack */		\
   466  	MOVQ	argptr+16(FP), SI;		\
   467  	MOVLQZX argsize+24(FP), CX;		\
   468  	MOVQ	SP, DI;				\
   469  	REP;MOVSB;				\
   470  	/* call function */			\
   471  	MOVQ	f+8(FP), DX;			\
   472  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   473  	CALL	(DX);				\
   474  	/* copy return values back */		\
   475  	MOVQ	argptr+16(FP), DI;		\
   476  	MOVLQZX	argsize+24(FP), CX;		\
   477  	MOVLQZX retoffset+28(FP), BX;		\
   478  	MOVQ	SP, SI;				\
   479  	ADDQ	BX, DI;				\
   480  	ADDQ	BX, SI;				\
   481  	SUBQ	BX, CX;				\
   482  	REP;MOVSB;				\
   483  	/* execute write barrier updates */	\
   484  	MOVQ	argtype+0(FP), DX;		\
   485  	MOVQ	argptr+16(FP), DI;		\
   486  	MOVLQZX	argsize+24(FP), CX;		\
   487  	MOVLQZX retoffset+28(FP), BX;		\
   488  	MOVQ	DX, 0(SP);			\
   489  	MOVQ	DI, 8(SP);			\
   490  	MOVQ	CX, 16(SP);			\
   491  	MOVQ	BX, 24(SP);			\
   492  	CALL	runtime·callwritebarrier(SB);	\
   493  	RET
   494  
   495  CALLFN(·call32, 32)
   496  CALLFN(·call64, 64)
   497  CALLFN(·call128, 128)
   498  CALLFN(·call256, 256)
   499  CALLFN(·call512, 512)
   500  CALLFN(·call1024, 1024)
   501  CALLFN(·call2048, 2048)
   502  CALLFN(·call4096, 4096)
   503  CALLFN(·call8192, 8192)
   504  CALLFN(·call16384, 16384)
   505  CALLFN(·call32768, 32768)
   506  CALLFN(·call65536, 65536)
   507  CALLFN(·call131072, 131072)
   508  CALLFN(·call262144, 262144)
   509  CALLFN(·call524288, 524288)
   510  CALLFN(·call1048576, 1048576)
   511  CALLFN(·call2097152, 2097152)
   512  CALLFN(·call4194304, 4194304)
   513  CALLFN(·call8388608, 8388608)
   514  CALLFN(·call16777216, 16777216)
   515  CALLFN(·call33554432, 33554432)
   516  CALLFN(·call67108864, 67108864)
   517  CALLFN(·call134217728, 134217728)
   518  CALLFN(·call268435456, 268435456)
   519  CALLFN(·call536870912, 536870912)
   520  CALLFN(·call1073741824, 1073741824)
   521  
   522  // bool cas(int32 *val, int32 old, int32 new)
   523  // Atomically:
   524  //	if(*val == old){
   525  //		*val = new;
   526  //		return 1;
   527  //	} else
   528  //		return 0;
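         // Callers typically wrap this in a retry loop, e.g. (Go-style sketch):
         //	for {
         //		v := atomicload(addr)
         //		if cas(addr, v, v+1) {
         //			break
         //		}
         //	}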
   529  TEXT runtime·cas(SB), NOSPLIT, $0-17
   530  	MOVQ	ptr+0(FP), BX
   531  	MOVL	old+8(FP), AX
   532  	MOVL	new+12(FP), CX
   533  	LOCK
   534  	CMPXCHGL	CX, 0(BX)
   535  	SETEQ	ret+16(FP)
   536  	RET
   537  
   538  // bool	runtime·cas64(uint64 *val, uint64 old, uint64 new)
   539  // Atomically:
    540  //	if(*val == old){
   541  //		*val = new;
   542  //		return 1;
   543  //	} else {
   544  //		return 0;
   545  //	}
   546  TEXT runtime·cas64(SB), NOSPLIT, $0-25
   547  	MOVQ	ptr+0(FP), BX
   548  	MOVQ	old+8(FP), AX
   549  	MOVQ	new+16(FP), CX
   550  	LOCK
   551  	CMPXCHGQ	CX, 0(BX)
   552  	SETEQ	ret+24(FP)
   553  	RET
   554  	
   555  TEXT runtime·casuintptr(SB), NOSPLIT, $0-25
   556  	JMP	runtime·cas64(SB)
   557  
   558  TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-16
   559  	JMP	runtime·atomicload64(SB)
   560  
   561  TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-16
   562  	JMP	runtime·atomicload64(SB)
   563  
   564  TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16
   565  	JMP	runtime·atomicstore64(SB)
   566  
   567  // bool casp(void **val, void *old, void *new)
   568  // Atomically:
   569  //	if(*val == old){
   570  //		*val = new;
   571  //		return 1;
   572  //	} else
   573  //		return 0;
   574  TEXT runtime·casp1(SB), NOSPLIT, $0-25
   575  	MOVQ	ptr+0(FP), BX
   576  	MOVQ	old+8(FP), AX
   577  	MOVQ	new+16(FP), CX
   578  	LOCK
   579  	CMPXCHGQ	CX, 0(BX)
   580  	SETEQ	ret+24(FP)
   581  	RET
   582  
   583  // uint32 xadd(uint32 volatile *val, int32 delta)
   584  // Atomically:
   585  //	*val += delta;
   586  //	return *val;
   587  TEXT runtime·xadd(SB), NOSPLIT, $0-20
   588  	MOVQ	ptr+0(FP), BX
   589  	MOVL	delta+8(FP), AX
   590  	MOVL	AX, CX
   591  	LOCK
   592  	XADDL	AX, 0(BX)
   593  	ADDL	CX, AX
   594  	MOVL	AX, ret+16(FP)
   595  	RET
   596  
   597  TEXT runtime·xadd64(SB), NOSPLIT, $0-24
   598  	MOVQ	ptr+0(FP), BX
   599  	MOVQ	delta+8(FP), AX
   600  	MOVQ	AX, CX
   601  	LOCK
   602  	XADDQ	AX, 0(BX)
   603  	ADDQ	CX, AX
   604  	MOVQ	AX, ret+16(FP)
   605  	RET
   606  
   607  TEXT runtime·xadduintptr(SB), NOSPLIT, $0-24
   608  	JMP	runtime·xadd64(SB)
   609  
   610  TEXT runtime·xchg(SB), NOSPLIT, $0-20
   611  	MOVQ	ptr+0(FP), BX
   612  	MOVL	new+8(FP), AX
   613  	XCHGL	AX, 0(BX)
   614  	MOVL	AX, ret+16(FP)
   615  	RET
   616  
   617  TEXT runtime·xchg64(SB), NOSPLIT, $0-24
   618  	MOVQ	ptr+0(FP), BX
   619  	MOVQ	new+8(FP), AX
   620  	XCHGQ	AX, 0(BX)
   621  	MOVQ	AX, ret+16(FP)
   622  	RET
   623  
   624  TEXT runtime·xchgp1(SB), NOSPLIT, $0-24
   625  	MOVQ	ptr+0(FP), BX
   626  	MOVQ	new+8(FP), AX
   627  	XCHGQ	AX, 0(BX)
   628  	MOVQ	AX, ret+16(FP)
   629  	RET
   630  
   631  TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24
   632  	JMP	runtime·xchg64(SB)
   633  
   634  TEXT runtime·procyield(SB),NOSPLIT,$0-0
   635  	MOVL	cycles+0(FP), AX
   636  again:
   637  	PAUSE
   638  	SUBL	$1, AX
   639  	JNZ	again
   640  	RET
   641  
   642  TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16
   643  	MOVQ	ptr+0(FP), BX
   644  	MOVQ	val+8(FP), AX
   645  	XCHGQ	AX, 0(BX)
   646  	RET
   647  
   648  TEXT runtime·atomicstore(SB), NOSPLIT, $0-12
   649  	MOVQ	ptr+0(FP), BX
   650  	MOVL	val+8(FP), AX
   651  	XCHGL	AX, 0(BX)
   652  	RET
   653  
   654  TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16
   655  	MOVQ	ptr+0(FP), BX
   656  	MOVQ	val+8(FP), AX
   657  	XCHGQ	AX, 0(BX)
   658  	RET
   659  
   660  // void	runtime·atomicor8(byte volatile*, byte);
   661  TEXT runtime·atomicor8(SB), NOSPLIT, $0-9
   662  	MOVQ	ptr+0(FP), AX
   663  	MOVB	val+8(FP), BX
   664  	LOCK
   665  	ORB	BX, (AX)
   666  	RET
   667  
   668  // void	runtime·atomicand8(byte volatile*, byte);
   669  TEXT runtime·atomicand8(SB), NOSPLIT, $0-9
   670  	MOVQ	ptr+0(FP), AX
   671  	MOVB	val+8(FP), BX
   672  	LOCK
   673  	ANDB	BX, (AX)
   674  	RET
   675  
   676  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   677  	// Stores are already ordered on x86, so this is just a
   678  	// compile barrier.
   679  	RET
   680  
   681  // void jmpdefer(fn, sp);
   682  // called from deferreturn.
   683  // 1. pop the caller
    684  // 2. sub 5 bytes from the caller's return PC (back onto the CALL, so deferreturn runs again)
   685  // 3. jmp to the argument
   686  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   687  	MOVQ	fv+0(FP), DX	// fn
   688  	MOVQ	argp+8(FP), BX	// caller sp
   689  	LEAQ	-8(BX), SP	// caller sp after CALL
   690  	SUBQ	$5, (SP)	// return to CALL again
   691  	MOVQ	0(DX), BX
   692  	JMP	BX	// but first run the deferred function
   693  
   694  // Save state of caller into g->sched. Smashes R8, R9.
   695  TEXT gosave<>(SB),NOSPLIT,$0
   696  	get_tls(R8)
   697  	MOVQ	g(R8), R8
   698  	MOVQ	0(SP), R9
   699  	MOVQ	R9, (g_sched+gobuf_pc)(R8)
   700  	LEAQ	8(SP), R9
   701  	MOVQ	R9, (g_sched+gobuf_sp)(R8)
   702  	MOVQ	$0, (g_sched+gobuf_ret)(R8)
   703  	MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
   704  	MOVQ	BP, (g_sched+gobuf_bp)(R8)
   705  	RET
   706  
   707  // func asmcgocall(fn, arg unsafe.Pointer) int32
   708  // Call fn(arg) on the scheduler stack,
   709  // aligned appropriately for the gcc ABI.
   710  // See cgocall.go for more details.
   711  TEXT ·asmcgocall(SB),NOSPLIT,$0-20
   712  	MOVQ	fn+0(FP), AX
   713  	MOVQ	arg+8(FP), BX
   714  
   715  	MOVQ	SP, DX
   716  
   717  	// Figure out if we need to switch to m->g0 stack.
   718  	// We get called to create new OS threads too, and those
   719  	// come in on the m->g0 stack already.
   720  	get_tls(CX)
   721  	MOVQ	g(CX), R8
   722  	MOVQ	g_m(R8), R8
   723  	MOVQ	m_g0(R8), SI
   724  	MOVQ	g(CX), DI
   725  	CMPQ	SI, DI
   726  	JEQ	nosave
   727  	MOVQ	m_gsignal(R8), SI
   728  	CMPQ	SI, DI
   729  	JEQ	nosave
   730  	
   731  	MOVQ	m_g0(R8), SI
   732  	CALL	gosave<>(SB)
   733  	MOVQ	SI, g(CX)
   734  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   735  nosave:
   736  
   737  	// Now on a scheduling stack (a pthread-created stack).
   738  	// Make sure we have enough room for 4 stack-backed fast-call
   739  	// registers as per windows amd64 calling convention.
   740  	SUBQ	$64, SP
   741  	ANDQ	$~15, SP	// alignment for gcc ABI
   742  	MOVQ	DI, 48(SP)	// save g
   743  	MOVQ	(g_stack+stack_hi)(DI), DI
   744  	SUBQ	DX, DI
   745  	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   746  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   747  	MOVQ	BX, CX		// CX = first argument in Win64
   748  	CALL	AX
   749  
   750  	// Restore registers, g, stack pointer.
   751  	get_tls(CX)
   752  	MOVQ	48(SP), DI
   753  	MOVQ	(g_stack+stack_hi)(DI), SI
   754  	SUBQ	40(SP), SI
   755  	MOVQ	DI, g(CX)
   756  	MOVQ	SI, SP
   757  
   758  	MOVL	AX, ret+16(FP)
   759  	RET
   760  
   761  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
   762  // Turn the fn into a Go func (by taking its address) and call
   763  // cgocallback_gofunc.
   764  TEXT runtime·cgocallback(SB),NOSPLIT,$24-24
   765  	LEAQ	fn+0(FP), AX
   766  	MOVQ	AX, 0(SP)
   767  	MOVQ	frame+8(FP), AX
   768  	MOVQ	AX, 8(SP)
   769  	MOVQ	framesize+16(FP), AX
   770  	MOVQ	AX, 16(SP)
   771  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   772  	CALL	AX
   773  	RET
   774  
   775  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
   776  // See cgocall.go for more details.
   777  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-24
   778  	NO_LOCAL_POINTERS
   779  
   780  	// If g is nil, Go did not create the current thread.
   781  	// Call needm to obtain one m for temporary use.
   782  	// In this case, we're running on the thread stack, so there's
   783  	// lots of space, but the linker doesn't know. Hide the call from
   784  	// the linker analysis by using an indirect call through AX.
   785  	get_tls(CX)
   786  #ifdef GOOS_windows
   787  	MOVL	$0, BX
   788  	CMPQ	CX, $0
   789  	JEQ	2(PC)
   790  #endif
   791  	MOVQ	g(CX), BX
   792  	CMPQ	BX, $0
   793  	JEQ	needm
   794  	MOVQ	g_m(BX), BX
   795  	MOVQ	BX, R8 // holds oldm until end of function
   796  	JMP	havem
   797  needm:
   798  	MOVQ	$0, 0(SP)
   799  	MOVQ	$runtime·needm(SB), AX
   800  	CALL	AX
   801  	MOVQ	0(SP), R8
   802  	get_tls(CX)
   803  	MOVQ	g(CX), BX
   804  	MOVQ	g_m(BX), BX
   805  	
   806  	// Set m->sched.sp = SP, so that if a panic happens
   807  	// during the function we are about to execute, it will
   808  	// have a valid SP to run on the g0 stack.
   809  	// The next few lines (after the havem label)
   810  	// will save this SP onto the stack and then write
   811  	// the same SP back to m->sched.sp. That seems redundant,
   812  	// but if an unrecovered panic happens, unwindm will
   813  	// restore the g->sched.sp from the stack location
   814  	// and then systemstack will try to use it. If we don't set it here,
   815  	// that restored SP will be uninitialized (typically 0) and
   816  	// will not be usable.
   817  	MOVQ	m_g0(BX), SI
   818  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   819  
   820  havem:
   821  	// Now there's a valid m, and we're running on its m->g0.
   822  	// Save current m->g0->sched.sp on stack and then set it to SP.
   823  	// Save current sp in m->g0->sched.sp in preparation for
   824  	// switch back to m->curg stack.
   825  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   826  	MOVQ	m_g0(BX), SI
   827  	MOVQ	(g_sched+gobuf_sp)(SI), AX
   828  	MOVQ	AX, 0(SP)
   829  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   830  
   831  	// Switch to m->curg stack and call runtime.cgocallbackg.
   832  	// Because we are taking over the execution of m->curg
   833  	// but *not* resuming what had been running, we need to
   834  	// save that information (m->curg->sched) so we can restore it.
   835  	// We can restore m->curg->sched.sp easily, because calling
   836  	// runtime.cgocallbackg leaves SP unchanged upon return.
   837  	// To save m->curg->sched.pc, we push it onto the stack.
   838  	// This has the added benefit that it looks to the traceback
   839  	// routine like cgocallbackg is going to return to that
   840  	// PC (because the frame we allocate below has the same
   841  	// size as cgocallback_gofunc's frame declared above)
   842  	// so that the traceback will seamlessly trace back into
   843  	// the earlier calls.
   844  	//
   845  	// In the new goroutine, 0(SP) holds the saved R8.
   846  	MOVQ	m_curg(BX), SI
   847  	MOVQ	SI, g(CX)
   848  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   849  	MOVQ	(g_sched+gobuf_pc)(SI), BX
   850  	MOVQ	BX, -8(DI)
   851  	// Compute the size of the frame, including return PC and, if
    852  // GOEXPERIMENT=framepointer, the saved base pointer
   853  	LEAQ	fv+0(FP), AX
   854  	SUBQ	SP, AX
   855  	SUBQ	AX, DI
   856  	MOVQ	DI, SP
   857  
   858  	MOVQ	R8, 0(SP)
   859  	CALL	runtime·cgocallbackg(SB)
   860  	MOVQ	0(SP), R8
   861  
   862  	// Compute the size of the frame again.  FP and SP have
   863  	// completely different values here than they did above,
   864  	// but only their difference matters.
   865  	LEAQ	fv+0(FP), AX
   866  	SUBQ	SP, AX
   867  
   868  	// Restore g->sched (== m->curg->sched) from saved values.
   869  	get_tls(CX)
   870  	MOVQ	g(CX), SI
   871  	MOVQ	SP, DI
   872  	ADDQ	AX, DI
   873  	MOVQ	-8(DI), BX
   874  	MOVQ	BX, (g_sched+gobuf_pc)(SI)
   875  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   876  
   877  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   878  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   879  	// so we do not have to restore it.)
   880  	MOVQ	g(CX), BX
   881  	MOVQ	g_m(BX), BX
   882  	MOVQ	m_g0(BX), SI
   883  	MOVQ	SI, g(CX)
   884  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   885  	MOVQ	0(SP), AX
   886  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   887  	
   888  	// If the m on entry was nil, we called needm above to borrow an m
   889  	// for the duration of the call. Since the call is over, return it with dropm.
   890  	CMPQ	R8, $0
   891  	JNE 3(PC)
   892  	MOVQ	$runtime·dropm(SB), AX
   893  	CALL	AX
   894  
   895  	// Done!
   896  	RET
   897  
   898  // void setg(G*); set g. for use by needm.
   899  TEXT runtime·setg(SB), NOSPLIT, $0-8
   900  	MOVQ	gg+0(FP), BX
   901  #ifdef GOOS_windows
   902  	CMPQ	BX, $0
   903  	JNE	settls
   904  	MOVQ	$0, 0x28(GS)
   905  	RET
   906  settls:
   907  	MOVQ	g_m(BX), AX
   908  	LEAQ	m_tls(AX), AX
   909  	MOVQ	AX, 0x28(GS)
   910  #endif
   911  	get_tls(CX)
   912  	MOVQ	BX, g(CX)
   913  	RET
   914  
   915  // void setg_gcc(G*); set g called from gcc.
   916  TEXT setg_gcc<>(SB),NOSPLIT,$0
   917  	get_tls(AX)
   918  	MOVQ	DI, g(AX)
   919  	RET
   920  
   921  // check that SP is in range [g->stack.lo, g->stack.hi)
   922  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   923  	get_tls(CX)
   924  	MOVQ	g(CX), AX
   925  	CMPQ	(g_stack+stack_hi)(AX), SP
   926  	JHI	2(PC)
   927  	INT	$3
   928  	CMPQ	SP, (g_stack+stack_lo)(AX)
   929  	JHI	2(PC)
   930  	INT	$3
   931  	RET
   932  
   933  TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
   934  	MOVQ	argp+0(FP),AX		// addr of first arg
   935  	MOVQ	-8(AX),AX		// get calling pc
   936  	CMPQ	AX, runtime·stackBarrierPC(SB)
   937  	JNE	nobar
   938  	// Get original return PC.
   939  	CALL	runtime·nextBarrierPC(SB)
   940  	MOVQ	0(SP), AX
   941  nobar:
   942  	MOVQ	AX, ret+8(FP)
   943  	RET
   944  
   945  TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
   946  	MOVQ	argp+0(FP),AX		// addr of first arg
   947  	MOVQ	pc+8(FP), BX
   948  	MOVQ	-8(AX), CX
   949  	CMPQ	CX, runtime·stackBarrierPC(SB)
   950  	JEQ	setbar
   951  	MOVQ	BX, -8(AX)		// set calling pc
   952  	RET
   953  setbar:
   954  	// Set the stack barrier return PC.
   955  	MOVQ	BX, 0(SP)
   956  	CALL	runtime·setNextBarrierPC(SB)
   957  	RET
   958  
   959  TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
   960  	MOVQ	argp+0(FP), AX
   961  	MOVQ	AX, ret+8(FP)
   962  	RET
   963  
   964  // func cputicks() int64
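         // RDTSC returns the timestamp counter split across DX:AX; the
         // SHLQ/ADDQ below merge the two halves into a single int64.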
   965  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   966  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   967  	JNE	mfence
   968  	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
   969  	JMP	done
   970  mfence:
   971  	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
   972  done:
   973  	RDTSC
   974  	SHLQ	$32, DX
   975  	ADDQ	DX, AX
   976  	MOVQ	AX, ret+0(FP)
   977  	RET
   978  
   979  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   980  // redirects to memhash(p, h, size) using the size
   981  // stored in the closure.
   982  TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
   983  	GO_ARGS
   984  	NO_LOCAL_POINTERS
   985  	MOVQ	p+0(FP), AX
   986  	MOVQ	h+8(FP), BX
   987  	MOVQ	8(DX), CX
   988  	MOVQ	AX, 0(SP)
   989  	MOVQ	BX, 8(SP)
   990  	MOVQ	CX, 16(SP)
   991  	CALL	runtime·memhash(SB)
   992  	MOVQ	24(SP), AX
   993  	MOVQ	AX, ret+16(FP)
   994  	RET
   995  
   996  // hash function using AES hardware instructions
   997  TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   998  	MOVQ	p+0(FP), AX	// ptr to data
   999  	MOVQ	s+16(FP), CX	// size
  1000  	LEAQ	ret+24(FP), DX
  1001  	JMP	runtime·aeshashbody(SB)
  1002  
  1003  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
  1004  	MOVQ	p+0(FP), AX	// ptr to string struct
  1005  	MOVQ	8(AX), CX	// length of string
  1006  	MOVQ	(AX), AX	// string data
  1007  	LEAQ	ret+16(FP), DX
  1008  	JMP	runtime·aeshashbody(SB)
  1009  
  1010  // AX: data
  1011  // CX: length
  1012  // DX: address to put return value
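         // Dispatches on length: 0-15 bytes (masked or end-of-page shuffled
         // load), exactly 16, 17-32, 33-64, 65-128, and a loop for 129+; each
         // path mixes the data with AES rounds keyed by runtime·aeskeysched
         // and writes the 8-byte result through DX.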
  1013  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
  1014  	MOVQ	h+8(FP), X6	// seed to low 64 bits of xmm6
  1015  	PINSRQ	$1, CX, X6	// size to high 64 bits of xmm6
  1016  	PSHUFHW	$0, X6, X6	// replace size with its low 2 bytes repeated 4 times
  1017  	MOVO	runtime·aeskeysched(SB), X7
  1018  	CMPQ	CX, $16
  1019  	JB	aes0to15
  1020  	JE	aes16
  1021  	CMPQ	CX, $32
  1022  	JBE	aes17to32
  1023  	CMPQ	CX, $64
  1024  	JBE	aes33to64
  1025  	CMPQ	CX, $128
  1026  	JBE	aes65to128
  1027  	JMP	aes129plus
  1028  
  1029  aes0to15:
  1030  	TESTQ	CX, CX
  1031  	JE	aes0
  1032  
  1033  	ADDQ	$16, AX
  1034  	TESTW	$0xff0, AX
  1035  	JE	endofpage
  1036  
  1037  	// 16 bytes loaded at this address won't cross
  1038  	// a page boundary, so we can load it directly.
  1039  	MOVOU	-16(AX), X0
  1040  	ADDQ	CX, CX
  1041  	MOVQ	$masks<>(SB), AX
  1042  	PAND	(AX)(CX*8), X0
  1043  
  1044  	// scramble 3 times
  1045  	AESENC	X6, X0
  1046  	AESENC	X7, X0
  1047  	AESENC	X7, X0
  1048  	MOVQ	X0, (DX)
  1049  	RET
  1050  
  1051  endofpage:
  1052  	// address ends in 1111xxxx.  Might be up against
  1053  	// a page boundary, so load ending at last byte.
  1054  	// Then shift bytes down using pshufb.
  1055  	MOVOU	-32(AX)(CX*1), X0
  1056  	ADDQ	CX, CX
  1057  	MOVQ	$shifts<>(SB), AX
  1058  	PSHUFB	(AX)(CX*8), X0
  1059  	AESENC	X6, X0
  1060  	AESENC	X7, X0
  1061  	AESENC	X7, X0
  1062  	MOVQ	X0, (DX)
  1063  	RET
  1064  
  1065  aes0:
  1066  	// return input seed
  1067  	MOVQ	h+8(FP), AX
  1068  	MOVQ	AX, (DX)
  1069  	RET
  1070  
  1071  aes16:
  1072  	MOVOU	(AX), X0
  1073  	AESENC	X6, X0
  1074  	AESENC	X7, X0
  1075  	AESENC	X7, X0
  1076  	MOVQ	X0, (DX)
  1077  	RET
  1078  
  1079  aes17to32:
  1080  	// load data to be hashed
  1081  	MOVOU	(AX), X0
  1082  	MOVOU	-16(AX)(CX*1), X1
  1083  
  1084  	// scramble 3 times
  1085  	AESENC	X6, X0
  1086  	AESENC	runtime·aeskeysched+16(SB), X1
  1087  	AESENC	X7, X0
  1088  	AESENC	X7, X1
  1089  	AESENC	X7, X0
  1090  	AESENC	X7, X1
  1091  
  1092  	// combine results
  1093  	PXOR	X1, X0
  1094  	MOVQ	X0, (DX)
  1095  	RET
  1096  
  1097  aes33to64:
  1098  	MOVOU	(AX), X0
  1099  	MOVOU	16(AX), X1
  1100  	MOVOU	-32(AX)(CX*1), X2
  1101  	MOVOU	-16(AX)(CX*1), X3
  1102  	
  1103  	AESENC	X6, X0
  1104  	AESENC	runtime·aeskeysched+16(SB), X1
  1105  	AESENC	runtime·aeskeysched+32(SB), X2
  1106  	AESENC	runtime·aeskeysched+48(SB), X3
  1107  	AESENC	X7, X0
  1108  	AESENC	X7, X1
  1109  	AESENC	X7, X2
  1110  	AESENC	X7, X3
  1111  	AESENC	X7, X0
  1112  	AESENC	X7, X1
  1113  	AESENC	X7, X2
  1114  	AESENC	X7, X3
  1115  
  1116  	PXOR	X2, X0
  1117  	PXOR	X3, X1
  1118  	PXOR	X1, X0
  1119  	MOVQ	X0, (DX)
  1120  	RET
  1121  
  1122  aes65to128:
  1123  	MOVOU	(AX), X0
  1124  	MOVOU	16(AX), X1
  1125  	MOVOU	32(AX), X2
  1126  	MOVOU	48(AX), X3
  1127  	MOVOU	-64(AX)(CX*1), X4
  1128  	MOVOU	-48(AX)(CX*1), X5
  1129  	MOVOU	-32(AX)(CX*1), X8
  1130  	MOVOU	-16(AX)(CX*1), X9
  1131  	
  1132  	AESENC	X6, X0
  1133  	AESENC	runtime·aeskeysched+16(SB), X1
  1134  	AESENC	runtime·aeskeysched+32(SB), X2
  1135  	AESENC	runtime·aeskeysched+48(SB), X3
  1136  	AESENC	runtime·aeskeysched+64(SB), X4
  1137  	AESENC	runtime·aeskeysched+80(SB), X5
  1138  	AESENC	runtime·aeskeysched+96(SB), X8
  1139  	AESENC	runtime·aeskeysched+112(SB), X9
  1140  	AESENC	X7, X0
  1141  	AESENC	X7, X1
  1142  	AESENC	X7, X2
  1143  	AESENC	X7, X3
  1144  	AESENC	X7, X4
  1145  	AESENC	X7, X5
  1146  	AESENC	X7, X8
  1147  	AESENC	X7, X9
  1148  	AESENC	X7, X0
  1149  	AESENC	X7, X1
  1150  	AESENC	X7, X2
  1151  	AESENC	X7, X3
  1152  	AESENC	X7, X4
  1153  	AESENC	X7, X5
  1154  	AESENC	X7, X8
  1155  	AESENC	X7, X9
  1156  
  1157  	PXOR	X4, X0
  1158  	PXOR	X5, X1
  1159  	PXOR	X8, X2
  1160  	PXOR	X9, X3
  1161  	PXOR	X2, X0
  1162  	PXOR	X3, X1
  1163  	PXOR	X1, X0
  1164  	MOVQ	X0, (DX)
  1165  	RET
  1166  
  1167  aes129plus:
  1168  	// start with last (possibly overlapping) block
  1169  	MOVOU	-128(AX)(CX*1), X0
  1170  	MOVOU	-112(AX)(CX*1), X1
  1171  	MOVOU	-96(AX)(CX*1), X2
  1172  	MOVOU	-80(AX)(CX*1), X3
  1173  	MOVOU	-64(AX)(CX*1), X4
  1174  	MOVOU	-48(AX)(CX*1), X5
  1175  	MOVOU	-32(AX)(CX*1), X8
  1176  	MOVOU	-16(AX)(CX*1), X9
  1177  
  1178  	// scramble state once
  1179  	AESENC	X6, X0
  1180  	AESENC	runtime·aeskeysched+16(SB), X1
  1181  	AESENC	runtime·aeskeysched+32(SB), X2
  1182  	AESENC	runtime·aeskeysched+48(SB), X3
  1183  	AESENC	runtime·aeskeysched+64(SB), X4
  1184  	AESENC	runtime·aeskeysched+80(SB), X5
  1185  	AESENC	runtime·aeskeysched+96(SB), X8
  1186  	AESENC	runtime·aeskeysched+112(SB), X9
  1187  
  1188  	// compute number of remaining 128-byte blocks
  1189  	DECQ	CX
  1190  	SHRQ	$7, CX
  1191  	
  1192  aesloop:
  1193  	// scramble state, xor in a block
  1194  	MOVOU	(AX), X10
  1195  	MOVOU	16(AX), X11
  1196  	MOVOU	32(AX), X12
  1197  	MOVOU	48(AX), X13
  1198  	AESENC	X10, X0
  1199  	AESENC	X11, X1
  1200  	AESENC	X12, X2
  1201  	AESENC	X13, X3
  1202  	MOVOU	64(AX), X10
  1203  	MOVOU	80(AX), X11
  1204  	MOVOU	96(AX), X12
  1205  	MOVOU	112(AX), X13
  1206  	AESENC	X10, X4
  1207  	AESENC	X11, X5
  1208  	AESENC	X12, X8
  1209  	AESENC	X13, X9
  1210  
  1211  	// scramble state
  1212  	AESENC	X7, X0
  1213  	AESENC	X7, X1
  1214  	AESENC	X7, X2
  1215  	AESENC	X7, X3
  1216  	AESENC	X7, X4
  1217  	AESENC	X7, X5
  1218  	AESENC	X7, X8
  1219  	AESENC	X7, X9
  1220  
  1221  	ADDQ	$128, AX
  1222  	DECQ	CX
  1223  	JNE	aesloop
  1224  
  1225  	// 2 more scrambles to finish
  1226  	AESENC	X7, X0
  1227  	AESENC	X7, X1
  1228  	AESENC	X7, X2
  1229  	AESENC	X7, X3
  1230  	AESENC	X7, X4
  1231  	AESENC	X7, X5
  1232  	AESENC	X7, X8
  1233  	AESENC	X7, X9
  1234  	AESENC	X7, X0
  1235  	AESENC	X7, X1
  1236  	AESENC	X7, X2
  1237  	AESENC	X7, X3
  1238  	AESENC	X7, X4
  1239  	AESENC	X7, X5
  1240  	AESENC	X7, X8
  1241  	AESENC	X7, X9
  1242  
  1243  	PXOR	X4, X0
  1244  	PXOR	X5, X1
  1245  	PXOR	X8, X2
  1246  	PXOR	X9, X3
  1247  	PXOR	X2, X0
  1248  	PXOR	X3, X1
  1249  	PXOR	X1, X0
  1250  	MOVQ	X0, (DX)
  1251  	RET
  1252  	
  1253  TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
  1254  	MOVQ	p+0(FP), AX	// ptr to data
  1255  	MOVQ	h+8(FP), X0	// seed
  1256  	PINSRD	$2, (AX), X0	// data
  1257  	AESENC	runtime·aeskeysched+0(SB), X0
  1258  	AESENC	runtime·aeskeysched+16(SB), X0
  1259  	AESENC	runtime·aeskeysched+32(SB), X0
  1260  	MOVQ	X0, ret+16(FP)
  1261  	RET
  1262  
  1263  TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
  1264  	MOVQ	p+0(FP), AX	// ptr to data
  1265  	MOVQ	h+8(FP), X0	// seed
  1266  	PINSRQ	$1, (AX), X0	// data
  1267  	AESENC	runtime·aeskeysched+0(SB), X0
  1268  	AESENC	runtime·aeskeysched+16(SB), X0
  1269  	AESENC	runtime·aeskeysched+32(SB), X0
  1270  	MOVQ	X0, ret+16(FP)
  1271  	RET
  1272  
  1273  // simple mask to get rid of data in the high part of the register.
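         // Entry i lives at offset i*16 and keeps only the low i bytes of a
         // 16-byte load; e.g. the i=3 entry (0x0000000000ffffff, 0) zeroes
         // everything past the 3 key bytes, so short keys hash independently
         // of whatever garbage follows them in memory.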
  1274  DATA masks<>+0x00(SB)/8, $0x0000000000000000
  1275  DATA masks<>+0x08(SB)/8, $0x0000000000000000
  1276  DATA masks<>+0x10(SB)/8, $0x00000000000000ff
  1277  DATA masks<>+0x18(SB)/8, $0x0000000000000000
  1278  DATA masks<>+0x20(SB)/8, $0x000000000000ffff
  1279  DATA masks<>+0x28(SB)/8, $0x0000000000000000
  1280  DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
  1281  DATA masks<>+0x38(SB)/8, $0x0000000000000000
  1282  DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
  1283  DATA masks<>+0x48(SB)/8, $0x0000000000000000
  1284  DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
  1285  DATA masks<>+0x58(SB)/8, $0x0000000000000000
  1286  DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
  1287  DATA masks<>+0x68(SB)/8, $0x0000000000000000
  1288  DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
  1289  DATA masks<>+0x78(SB)/8, $0x0000000000000000
  1290  DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
  1291  DATA masks<>+0x88(SB)/8, $0x0000000000000000
  1292  DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
  1293  DATA masks<>+0x98(SB)/8, $0x00000000000000ff
  1294  DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
  1295  DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
  1296  DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
  1297  DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
  1298  DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
  1299  DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
  1300  DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
  1301  DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
  1302  DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
  1303  DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
  1304  DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
  1305  DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
  1306  GLOBL masks<>(SB),RODATA,$256
  1307  
  1308  // these are arguments to pshufb.  They move data down from
  1309  // the high bytes of the register to the low bytes of the register.
  1310  // index is how many bytes to move.
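         // e.g. the i=3 entry, 0xffffffffff0f0e0d, picks source bytes
         // 13,14,15 into positions 0..2 and zeroes the rest (PSHUFB zeroes
         // any destination byte whose index has its high bit set).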
  1311  DATA shifts<>+0x00(SB)/8, $0x0000000000000000
  1312  DATA shifts<>+0x08(SB)/8, $0x0000000000000000
  1313  DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
  1314  DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
  1315  DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
  1316  DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
  1317  DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
  1318  DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
  1319  DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
  1320  DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
  1321  DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
  1322  DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
  1323  DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
  1324  DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
  1325  DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
  1326  DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
  1327  DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
  1328  DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
  1329  DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
  1330  DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
  1331  DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
  1332  DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
  1333  DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
  1334  DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
  1335  DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
  1336  DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
  1337  DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
  1338  DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
  1339  DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
  1340  DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
  1341  DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
  1342  DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
  1343  GLOBL shifts<>(SB),RODATA,$256
  1344  
  1345  TEXT runtime·memeq(SB),NOSPLIT,$0-25
  1346  	MOVQ	a+0(FP), SI
  1347  	MOVQ	b+8(FP), DI
  1348  	MOVQ	size+16(FP), BX
  1349  	LEAQ	ret+24(FP), AX
  1350  	JMP	runtime·memeqbody(SB)
  1351  
  1352  // memequal_varlen(a, b unsafe.Pointer) bool
  1353  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
  1354  	MOVQ	a+0(FP), SI
  1355  	MOVQ	b+8(FP), DI
  1356  	CMPQ	SI, DI
  1357  	JEQ	eq
  1358  	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
  1359  	LEAQ	ret+16(FP), AX
  1360  	JMP	runtime·memeqbody(SB)
  1361  eq:
  1362  	MOVB	$1, ret+16(FP)
  1363  	RET
  1364  
  1365  // eqstring tests whether two strings are equal.
  1366  // The compiler guarantees that strings passed
  1367  // to eqstring have equal length.
  1368  // See runtime_test.go:eqstring_generic for
  1369  // equivalent Go code.
  1370  TEXT runtime·eqstring(SB),NOSPLIT,$0-33
  1371  	MOVQ	s1str+0(FP), SI
  1372  	MOVQ	s2str+16(FP), DI
  1373  	CMPQ	SI, DI
  1374  	JEQ	eq
  1375  	MOVQ	s1len+8(FP), BX
  1376  	LEAQ	v+32(FP), AX
  1377  	JMP	runtime·memeqbody(SB)
  1378  eq:
  1379  	MOVB	$1, v+32(FP)
  1380  	RET
  1381  
  1382  // a in SI
  1383  // b in DI
  1384  // count in BX
  1385  // address of result byte in AX
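         // Strategy: compare 64 bytes per iteration with PCMPEQB, then 8
         // bytes at a time in a GP register, then one (possibly overlapping)
         // 8-byte load for the tail; sizes under 8 take the "small" path,
         // which loads without crossing a page boundary and shifts away the
         // bytes beyond the count before testing.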
  1386  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1387  	CMPQ	BX, $8
  1388  	JB	small
  1389  	
  1390  	// 64 bytes at a time using xmm registers
  1391  hugeloop:
  1392  	CMPQ	BX, $64
  1393  	JB	bigloop
  1394  	MOVOU	(SI), X0
  1395  	MOVOU	(DI), X1
  1396  	MOVOU	16(SI), X2
  1397  	MOVOU	16(DI), X3
  1398  	MOVOU	32(SI), X4
  1399  	MOVOU	32(DI), X5
  1400  	MOVOU	48(SI), X6
  1401  	MOVOU	48(DI), X7
  1402  	PCMPEQB	X1, X0
  1403  	PCMPEQB	X3, X2
  1404  	PCMPEQB	X5, X4
  1405  	PCMPEQB	X7, X6
  1406  	PAND	X2, X0
  1407  	PAND	X6, X4
  1408  	PAND	X4, X0
  1409  	PMOVMSKB X0, DX
  1410  	ADDQ	$64, SI
  1411  	ADDQ	$64, DI
  1412  	SUBQ	$64, BX
  1413  	CMPL	DX, $0xffff
  1414  	JEQ	hugeloop
  1415  	MOVB	$0, (AX)
  1416  	RET
  1417  
  1418  	// 8 bytes at a time using 64-bit register
  1419  bigloop:
  1420  	CMPQ	BX, $8
  1421  	JBE	leftover
  1422  	MOVQ	(SI), CX
  1423  	MOVQ	(DI), DX
  1424  	ADDQ	$8, SI
  1425  	ADDQ	$8, DI
  1426  	SUBQ	$8, BX
  1427  	CMPQ	CX, DX
  1428  	JEQ	bigloop
  1429  	MOVB	$0, (AX)
  1430  	RET
  1431  
  1432  	// remaining 0-8 bytes
  1433  leftover:
  1434  	MOVQ	-8(SI)(BX*1), CX
  1435  	MOVQ	-8(DI)(BX*1), DX
  1436  	CMPQ	CX, DX
  1437  	SETEQ	(AX)
  1438  	RET
  1439  
  1440  small:
  1441  	CMPQ	BX, $0
  1442  	JEQ	equal
  1443  
  1444  	LEAQ	0(BX*8), CX
  1445  	NEGQ	CX
  1446  
  1447  	CMPB	SI, $0xf8
  1448  	JA	si_high
  1449  
  1450  	// load at SI won't cross a page boundary.
  1451  	MOVQ	(SI), SI
  1452  	JMP	si_finish
  1453  si_high:
   1454  	// address ends in 11111xxx.  Load the 8 bytes ending at the last byte we want, then shift them down into position.
  1455  	MOVQ	-8(SI)(BX*1), SI
  1456  	SHRQ	CX, SI
  1457  si_finish:
  1458  
  1459  	// same for DI.
  1460  	CMPB	DI, $0xf8
  1461  	JA	di_high
  1462  	MOVQ	(DI), DI
  1463  	JMP	di_finish
  1464  di_high:
  1465  	MOVQ	-8(DI)(BX*1), DI
  1466  	SHRQ	CX, DI
  1467  di_finish:
  1468  
  1469  	SUBQ	SI, DI
  1470  	SHLQ	CX, DI
  1471  equal:
  1472  	SETEQ	(AX)
  1473  	RET
  1474  
  1475  TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
  1476  	MOVQ	s1_base+0(FP), SI
  1477  	MOVQ	s1_len+8(FP), BX
  1478  	MOVQ	s2_base+16(FP), DI
  1479  	MOVQ	s2_len+24(FP), DX
  1480  	LEAQ	ret+32(FP), R9
  1481  	JMP	runtime·cmpbody(SB)
  1482  
  1483  TEXT bytes·Compare(SB),NOSPLIT,$0-56
  1484  	MOVQ	s1+0(FP), SI
  1485  	MOVQ	s1+8(FP), BX
  1486  	MOVQ	s2+24(FP), DI
  1487  	MOVQ	s2+32(FP), DX
  1488  	LEAQ	res+48(FP), R9
  1489  	JMP	runtime·cmpbody(SB)
  1490  
  1491  // input:
  1492  //   SI = a
  1493  //   DI = b
  1494  //   BX = alen
  1495  //   DX = blen
  1496  //   R9 = address of output word (stores -1/0/1 here)
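         // Strategy: compare min(alen, blen) bytes 16 at a time with
         // PCMPEQB/PMOVMSKB; on a mismatch BSF locates the first differing
         // byte and an unsigned compare of that byte gives the sign.  Shorter
         // tails use 8-byte loads with BSWAPQ+BSRQ to find the first
         // difference; if the common prefix is equal, the lengths decide.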
  1497  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1498  	CMPQ	SI, DI
  1499  	JEQ	allsame
  1500  	CMPQ	BX, DX
  1501  	MOVQ	DX, R8
  1502  	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
  1503  	CMPQ	R8, $8
  1504  	JB	small
  1505  
  1506  loop:
  1507  	CMPQ	R8, $16
  1508  	JBE	_0through16
  1509  	MOVOU	(SI), X0
  1510  	MOVOU	(DI), X1
  1511  	PCMPEQB X0, X1
  1512  	PMOVMSKB X1, AX
  1513  	XORQ	$0xffff, AX	// convert EQ to NE
  1514  	JNE	diff16	// branch if at least one byte is not equal
  1515  	ADDQ	$16, SI
  1516  	ADDQ	$16, DI
  1517  	SUBQ	$16, R8
  1518  	JMP	loop
  1519  	
  1520  	// AX = bit mask of differences
  1521  diff16:
  1522  	BSFQ	AX, BX	// index of first byte that differs
  1523  	XORQ	AX, AX
  1524  	MOVB	(SI)(BX*1), CX
  1525  	CMPB	CX, (DI)(BX*1)
  1526  	SETHI	AX
  1527  	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
  1528  	MOVQ	AX, (R9)
  1529  	RET
  1530  
  1531  	// 0 through 16 bytes left, alen>=8, blen>=8
  1532  _0through16:
  1533  	CMPQ	R8, $8
  1534  	JBE	_0through8
  1535  	MOVQ	(SI), AX
  1536  	MOVQ	(DI), CX
  1537  	CMPQ	AX, CX
  1538  	JNE	diff8
  1539  _0through8:
  1540  	MOVQ	-8(SI)(R8*1), AX
  1541  	MOVQ	-8(DI)(R8*1), CX
  1542  	CMPQ	AX, CX
  1543  	JEQ	allsame
  1544  
  1545  	// AX and CX contain parts of a and b that differ.
  1546  diff8:
  1547  	BSWAPQ	AX	// reverse order of bytes
  1548  	BSWAPQ	CX
  1549  	XORQ	AX, CX
  1550  	BSRQ	CX, CX	// index of highest bit difference
  1551  	SHRQ	CX, AX	// move a's bit to bottom
  1552  	ANDQ	$1, AX	// mask bit
  1553  	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
  1554  	MOVQ	AX, (R9)
  1555  	RET
  1556  
  1557  	// 0-7 bytes in common
  1558  small:
  1559  	LEAQ	(R8*8), CX	// bytes left -> bits left
   1560  	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
  1561  	JEQ	allsame
  1562  
   1563  	// load bytes of a into high bytes of SI
  1564  	CMPB	SI, $0xf8
  1565  	JA	si_high
  1566  	MOVQ	(SI), SI
  1567  	JMP	si_finish
  1568  si_high:
  1569  	MOVQ	-8(SI)(R8*1), SI
  1570  	SHRQ	CX, SI
  1571  si_finish:
  1572  	SHLQ	CX, SI
  1573  
   1574  	// load bytes of b into high bytes of DI
  1575  	CMPB	DI, $0xf8
  1576  	JA	di_high
  1577  	MOVQ	(DI), DI
  1578  	JMP	di_finish
  1579  di_high:
  1580  	MOVQ	-8(DI)(R8*1), DI
  1581  	SHRQ	CX, DI
  1582  di_finish:
  1583  	SHLQ	CX, DI
  1584  
  1585  	BSWAPQ	SI	// reverse order of bytes
  1586  	BSWAPQ	DI
  1587  	XORQ	SI, DI	// find bit differences
  1588  	JEQ	allsame
  1589  	BSRQ	DI, CX	// index of highest bit difference
  1590  	SHRQ	CX, SI	// move a's bit to bottom
  1591  	ANDQ	$1, SI	// mask bit
  1592  	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
  1593  	MOVQ	AX, (R9)
  1594  	RET
  1595  
  1596  allsame:
  1597  	XORQ	AX, AX
  1598  	XORQ	CX, CX
  1599  	CMPQ	BX, DX
  1600  	SETGT	AX	// 1 if alen > blen
  1601  	SETEQ	CX	// 1 if alen == blen
  1602  	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
  1603  	MOVQ	AX, (R9)
  1604  	RET
  1605  
  1606  TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
  1607  	MOVQ s+0(FP), SI
  1608  	MOVQ s_len+8(FP), BX
  1609  	MOVB c+24(FP), AL
  1610  	LEAQ ret+32(FP), R8
  1611  	JMP  runtime·indexbytebody(SB)
  1612  
  1613  TEXT strings·IndexByte(SB),NOSPLIT,$0-32
  1614  	MOVQ s+0(FP), SI
  1615  	MOVQ s_len+8(FP), BX
  1616  	MOVB c+16(FP), AL
  1617  	LEAQ ret+24(FP), R8
  1618  	JMP  runtime·indexbytebody(SB)
  1619  
  1620  // input:
  1621  //   SI: data
  1622  //   BX: data len
  1623  //   AL: byte sought
  1624  //   R8: address to put result
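         // Strategy: scan any unaligned head with REPN; SCASB, the aligned
         // middle 16 bytes at a time with PCMPEQB/PMOVMSKB, and the tail with
         // SCASB again; inputs shorter than 16 bytes use SCASB alone.  The
         // byte's offset (or -1 if absent) is written through R8.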
  1625  TEXT runtime·indexbytebody(SB),NOSPLIT,$0
  1626  	MOVQ SI, DI
  1627  
  1628  	CMPQ BX, $16
  1629  	JLT small
  1630  
  1631  	// round up to first 16-byte boundary
  1632  	TESTQ $15, SI
  1633  	JZ aligned
  1634  	MOVQ SI, CX
  1635  	ANDQ $~15, CX
  1636  	ADDQ $16, CX
  1637  
  1638  	// search the beginning
  1639  	SUBQ SI, CX
  1640  	REPN; SCASB
  1641  	JZ success
  1642  
  1643  // DI is 16-byte aligned; get ready to search using SSE instructions
  1644  aligned:
  1645  	// round down to last 16-byte boundary
  1646  	MOVQ BX, R11
  1647  	ADDQ SI, R11
  1648  	ANDQ $~15, R11
  1649  
  1650  	// shuffle X0 around so that each byte contains c
  1651  	MOVD AX, X0
  1652  	PUNPCKLBW X0, X0
  1653  	PUNPCKLBW X0, X0
  1654  	PSHUFL $0, X0, X0
  1655  	JMP condition
  1656  
  1657  sse:
  1658  	// move the next 16-byte chunk of the buffer into X1
  1659  	MOVO (DI), X1
  1660  	// compare bytes in X0 to X1
  1661  	PCMPEQB X0, X1
  1662  	// take the top bit of each byte in X1 and put the result in DX
  1663  	PMOVMSKB X1, DX
  1664  	TESTL DX, DX
  1665  	JNZ ssesuccess
  1666  	ADDQ $16, DI
  1667  
  1668  condition:
  1669  	CMPQ DI, R11
  1670  	JLT sse
  1671  
  1672  	// search the end
  1673  	MOVQ SI, CX
  1674  	ADDQ BX, CX
  1675  	SUBQ R11, CX
  1676  	// if CX == 0, the zero flag will be set and we'll end up
  1677  	// returning a false success
  1678  	JZ failure
  1679  	REPN; SCASB
  1680  	JZ success
  1681  
  1682  failure:
  1683  	MOVQ $-1, (R8)
  1684  	RET
  1685  
  1686  // handle for lengths < 16
  1687  small:
  1688  	MOVQ BX, CX
  1689  	REPN; SCASB
  1690  	JZ success
  1691  	MOVQ $-1, (R8)
  1692  	RET
  1693  
  1694  // we've found the chunk containing the byte
  1695  // now just figure out which specific byte it is
  1696  ssesuccess:
  1697  	// get the index of the least significant set bit
  1698  	BSFW DX, DX
  1699  	SUBQ SI, DI
  1700  	ADDQ DI, DX
  1701  	MOVQ DX, (R8)
  1702  	RET
  1703  
  1704  success:
  1705  	SUBQ SI, DI
  1706  	SUBL $1, DI
  1707  	MOVQ DI, (R8)
  1708  	RET
  1709  
  1710  TEXT bytes·Equal(SB),NOSPLIT,$0-49
  1711  	MOVQ	a_len+8(FP), BX
  1712  	MOVQ	b_len+32(FP), CX
  1713  	CMPQ	BX, CX
  1714  	JNE	eqret
  1715  	MOVQ	a+0(FP), SI
  1716  	MOVQ	b+24(FP), DI
  1717  	LEAQ	ret+48(FP), AX
  1718  	JMP	runtime·memeqbody(SB)
  1719  eqret:
  1720  	MOVB	$0, ret+48(FP)
  1721  	RET
  1722  
  1723  TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
  1724  	get_tls(CX)
  1725  	MOVQ	g(CX), AX
  1726  	MOVQ	g_m(AX), AX
  1727  	MOVL	m_fastrand(AX), DX
  1728  	ADDL	DX, DX
  1729  	MOVL	DX, BX
  1730  	XORL	$0x88888eef, DX
  1731  	CMOVLMI	BX, DX
  1732  	MOVL	DX, m_fastrand(AX)
  1733  	MOVL	DX, ret+0(FP)
  1734  	RET
  1735  
  1736  TEXT runtime·return0(SB), NOSPLIT, $0
  1737  	MOVL	$0, AX
  1738  	RET
  1739  
  1740  
  1741  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  1742  // Must obey the gcc calling convention.
  1743  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  1744  	get_tls(CX)
  1745  	MOVQ	g(CX), AX
  1746  	MOVQ	g_m(AX), AX
  1747  	MOVQ	m_curg(AX), AX
  1748  	MOVQ	(g_stack+stack_hi)(AX), AX
  1749  	RET
  1750  
  1751  // The top-most function running on a goroutine
  1752  // returns to goexit+PCQuantum.
  1753  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  1754  	BYTE	$0x90	// NOP
  1755  	CALL	runtime·goexit1(SB)	// does not return
  1756  	// traceback from goexit1 must hit code range of goexit
  1757  	BYTE	$0x90	// NOP
  1758  
  1759  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
  1760  	MOVQ	addr+0(FP), AX
  1761  	PREFETCHT0	(AX)
  1762  	RET
  1763  
  1764  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
  1765  	MOVQ	addr+0(FP), AX
  1766  	PREFETCHT1	(AX)
  1767  	RET
  1768  
  1769  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
  1770  	MOVQ	addr+0(FP), AX
  1771  	PREFETCHT2	(AX)
  1772  	RET
  1773  
  1774  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
  1775  	MOVQ	addr+0(FP), AX
  1776  	PREFETCHNTA	(AX)
  1777  	RET
  1778  
  1779  // This is called from .init_array and follows the platform, not Go, ABI.
  1780  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  1781  	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
  1782  	MOVQ	runtime·lastmoduledatap(SB), AX
  1783  	MOVQ	DI, moduledata_next(AX)
  1784  	MOVQ	DI, runtime·lastmoduledatap(SB)
  1785  	POPQ	R15
  1786  	RET