rsc.io/go@v0.0.0-20150416155037-e040fd465409/src/runtime/asm_amd64.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0	// runtime bootstrap; reads argc from DI and argv from SI
    11  	// copy arguments forward on a 16-byte-aligned stack
    12  	MOVQ	DI, AX		// argc
    13  	MOVQ	SI, BX		// argv
    14  	SUBQ	$(4*8+7), SP		// 2args 2auto
    15  	ANDQ	$~15, SP		// round SP down to a multiple of 16
    16  	MOVQ	AX, 16(SP)		// park argc in an auto slot
    17  	MOVQ	BX, 24(SP)		// park argv in an auto slot
    18  	
    19  	// create istack out of the given (operating system) stack.
    20  	// _cgo_init may update stackguard.
    21  	MOVQ	$runtime·g0(SB), DI
    22  	LEAQ	(-64*1024+104)(SP), BX	// BX = SP - 64KB + 104: provisional floor of g0's stack
    23  	MOVQ	BX, g_stackguard0(DI)
    24  	MOVQ	BX, g_stackguard1(DI)
    25  	MOVQ	BX, (g_stack+stack_lo)(DI)
    26  	MOVQ	SP, (g_stack+stack_hi)(DI)
    27  
    28  	// find out information about the processor we're on
    29  	MOVQ	$0, AX		// CPUID leaf 0
    30  	CPUID
    31  	CMPQ	AX, $0		// AX still 0 after leaf 0: skip the leaf 1 query
    32  	JE	nocpuinfo
    33  
    34  	// Figure out how to serialize RDTSC.
    35  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    36  	// Don't know about the rest, so let's do MFENCE.
    37  	CMPL	BX, $0x756E6547  // "Genu"
    38  	JNE	notintel
    39  	CMPL	DX, $0x49656E69  // "ineI"
    40  	JNE	notintel
    41  	CMPL	CX, $0x6C65746E  // "ntel"
    42  	JNE	notintel
    43  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)	// consulted by cputicks to choose LFENCE over MFENCE
    44  notintel:
    45  
    46  	MOVQ	$1, AX		// CPUID leaf 1
    47  	CPUID
    48  	MOVL	CX, runtime·cpuid_ecx(SB)	// keep the CX result for later inspection
    49  	MOVL	DX, runtime·cpuid_edx(SB)	// keep the DX result for later inspection
    50  nocpuinfo:	
    51  	
    52  	// if there is an _cgo_init, call it.
    53  	MOVQ	_cgo_init(SB), AX
    54  	TESTQ	AX, AX
    55  	JZ	needtls		// no _cgo_init: set up TLS ourselves
    56  	// g0 already in DI
    57  	MOVQ	DI, CX	// Win64 uses CX for first parameter
    58  	MOVQ	$setg_gcc<>(SB), SI	// SI = address of setg_gcc<>, handed to _cgo_init
    59  	CALL	AX
    60  
    61  	// update stackguard after _cgo_init
    62  	MOVQ	$runtime·g0(SB), CX
    63  	MOVQ	(g_stack+stack_lo)(CX), AX
    64  	ADDQ	$const__StackGuard, AX	// guard = stack.lo + _StackGuard
    65  	MOVQ	AX, g_stackguard0(CX)
    66  	MOVQ	AX, g_stackguard1(CX)
    67  
    68  	CMPL	runtime·iswindows(SB), $0	// after _cgo_init, only Windows continues into the settls path
    69  	JEQ ok
    70  needtls:
    71  	// skip TLS setup on Plan 9
    72  	CMPL	runtime·isplan9(SB), $1
    73  	JEQ ok
    74  	// skip TLS setup on Solaris
    75  	CMPL	runtime·issolaris(SB), $1
    76  	JEQ ok
    77  
    78  	LEAQ	runtime·tls0(SB), DI	// DI = address of tls0, the argument for settls
    79  	CALL	runtime·settls(SB)
    80  
    81  	// store through it, to make sure it works
    82  	get_tls(BX)
    83  	MOVQ	$0x123, g(BX)		// write a marker through the TLS slot
    84  	MOVQ	runtime·tls0(SB), AX	// read it back directly from tls0
    85  	CMPQ	AX, $0x123
    86  	JEQ 2(PC)			// marker matches: skip the abort
    87  	MOVL	AX, 0	// abort
    88  ok:
    89  	// set the per-goroutine and per-mach "registers"
    90  	get_tls(BX)
    91  	LEAQ	runtime·g0(SB), CX
    92  	MOVQ	CX, g(BX)		// current g = g0
    93  	LEAQ	runtime·m0(SB), AX
    94  
    95  	// save m->g0 = g0
    96  	MOVQ	CX, m_g0(AX)
    97  	// save m0 to g0->m
    98  	MOVQ	AX, g_m(CX)
    99  
   100  	CLD				// convention is D is always left cleared
   101  	CALL	runtime·check(SB)
   102  
   103  	MOVL	16(SP), AX		// copy argc
   104  	MOVL	AX, 0(SP)		// first argument slot for runtime·args
   105  	MOVQ	24(SP), AX		// copy argv
   106  	MOVQ	AX, 8(SP)		// second argument slot for runtime·args
   107  	CALL	runtime·args(SB)
   108  	CALL	runtime·osinit(SB)
   109  	CALL	runtime·schedinit(SB)
   110  
   111  	// create a new goroutine to start program
   112  	MOVQ	$runtime·mainPC(SB), AX		// entry
   113  	PUSHQ	AX
   114  	PUSHQ	$0			// arg size
   115  	CALL	runtime·newproc(SB)
   116  	POPQ	AX			// discard the two words pushed for newproc
   117  	POPQ	AX
   118  
   119  	// start this M
   120  	CALL	runtime·mstart(SB)
   121  
   122  	MOVL	$0xf1, 0xf1  // crash
   123  	RET
   124  
   125  DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)	// mainPC holds the address of runtime·main
   126  GLOBL	runtime·mainPC(SB),RODATA,$8	// 8-byte read-only word; rt0_go passes its address to newproc
   127  
   128  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0	// trap into an attached debugger
   129  	BYTE	$0xcc	// 0xCC is the one-byte INT3 opcode
   130  	RET
   131  
   132  TEXT runtime·asminit(SB),NOSPLIT,$0-0	// deliberately empty: returns immediately
   133  	// No per-thread init.
   134  	RET
   135  
   136  /*
   137   *  go-routine
   138   */
   139  
   140  // void gosave(Gobuf*)
   141  // save the caller's SP, PC, BP and the current g in Gobuf, zeroing ret and ctxt; setjmp
   142  TEXT runtime·gosave(SB), NOSPLIT, $0-8
   143  	MOVQ	buf+0(FP), AX		// gobuf
   144  	LEAQ	buf+0(FP), BX		// caller's SP
   145  	MOVQ	BX, gobuf_sp(AX)
   146  	MOVQ	0(SP), BX		// caller's PC
   147  	MOVQ	BX, gobuf_pc(AX)
   148  	MOVQ	$0, gobuf_ret(AX)
   149  	MOVQ	$0, gobuf_ctxt(AX)
   150  	MOVQ	BP, gobuf_bp(AX)
   151  	get_tls(CX)
   152  	MOVQ	g(CX), BX		// current g, read from TLS
   153  	MOVQ	BX, gobuf_g(AX)
   154  	RET
   155  
   156  // void gogo(Gobuf*)
   157  // restore g, SP, BP, ret (in AX) and ctxt (in DX) from Gobuf, then jump to its pc; longjmp
   158  TEXT runtime·gogo(SB), NOSPLIT, $0-8
   159  	MOVQ	buf+0(FP), BX		// gobuf
   160  	MOVQ	gobuf_g(BX), DX
   161  	MOVQ	0(DX), CX		// make sure g != nil
   162  	get_tls(CX)
   163  	MOVQ	DX, g(CX)		// install gobuf.g as the current g
   164  	MOVQ	gobuf_sp(BX), SP	// restore SP
   165  	MOVQ	gobuf_ret(BX), AX
   166  	MOVQ	gobuf_ctxt(BX), DX
   167  	MOVQ	gobuf_bp(BX), BP
   168  	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   169  	MOVQ	$0, gobuf_ret(BX)
   170  	MOVQ	$0, gobuf_ctxt(BX)
   171  	MOVQ	$0, gobuf_bp(BX)
   172  	MOVQ	gobuf_pc(BX), BX	// load the target PC last: BX stops pointing at the gobuf here
   173  	JMP	BX
   174  
   175  // func mcall(fn func(*g))
   176  // Switch to m->g0's stack, call fn(g).
   177  // Fn must never return.  It should gogo(&g->sched)
   178  // to keep running g.  Jumps to badmcall if already on g0, badmcall2 if fn returns.
   179  TEXT runtime·mcall(SB), NOSPLIT, $0-8
   180  	MOVQ	fn+0(FP), DI
   181  	
   182  	get_tls(CX)
   183  	MOVQ	g(CX), AX	// save state in g->sched
   184  	MOVQ	0(SP), BX	// caller's PC
   185  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   186  	LEAQ	fn+0(FP), BX	// caller's SP
   187  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   188  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   189  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   190  
   191  	// switch to m->g0 & its stack, call fn
   192  	MOVQ	g(CX), BX
   193  	MOVQ	g_m(BX), BX
   194  	MOVQ	m_g0(BX), SI
   195  	CMPQ	SI, AX	// if g == m->g0 call badmcall
   196  	JNE	3(PC)	// g != g0: skip over the two-instruction badmcall jump
   197  	MOVQ	$runtime·badmcall(SB), AX
   198  	JMP	AX
   199  	MOVQ	SI, g(CX)	// g = m->g0
   200  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   201  	PUSHQ	AX		// the old g is fn's argument
   202  	MOVQ	DI, DX		// DX = fn, the closure value
   203  	MOVQ	0(DI), DI	// DI = code pointer stored in the first word of fn
   204  	CALL	DI
   205  	POPQ	AX
   206  	MOVQ	$runtime·badmcall2(SB), AX	// only reached if fn returned
   207  	JMP	AX
   208  	RET
   209  
   210  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   211  // of the G stack.  We need to distinguish the routine that
   212  // lives at the bottom of the G stack from the one that lives
   213  // at the top of the system stack because the one at the top of
   214  // the system stack terminates the stack walk (see topofstack()).
   215  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0	// systemstack stores this address as g->sched.pc
   216  	RET
   217  
   218  // func systemstack(fn func()): call fn on m->g0's stack, or in place when g is already g0 or gsignal
   219  TEXT runtime·systemstack(SB), NOSPLIT, $0-8
   220  	MOVQ	fn+0(FP), DI	// DI = fn
   221  	get_tls(CX)
   222  	MOVQ	g(CX), AX	// AX = g
   223  	MOVQ	g_m(AX), BX	// BX = m
   224  
   225  	MOVQ	m_gsignal(BX), DX	// DX = gsignal
   226  	CMPQ	AX, DX
   227  	JEQ	noswitch
   228  
   229  	MOVQ	m_g0(BX), DX	// DX = g0
   230  	CMPQ	AX, DX
   231  	JEQ	noswitch
   232  
   233  	MOVQ	m_curg(BX), R8	// R8 = curg
   234  	CMPQ	AX, R8
   235  	JEQ	switch
   236  	
   237  	// Bad: g is not gsignal, not g0, not curg. What is it?
   238  	MOVQ	$runtime·badsystemstack(SB), AX
   239  	CALL	AX	// NOTE(review): would fall into switch on return; presumably never returns — confirm
   240  
   241  switch:
   242  	// save our state in g->sched.  Pretend to
   243  	// be systemstack_switch if the G stack is scanned.
   244  	MOVQ	$runtime·systemstack_switch(SB), SI
   245  	MOVQ	SI, (g_sched+gobuf_pc)(AX)
   246  	MOVQ	SP, (g_sched+gobuf_sp)(AX)
   247  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   248  	MOVQ	BP, (g_sched+gobuf_bp)(AX)
   249  
   250  	// switch to g0
   251  	MOVQ	DX, g(CX)
   252  	MOVQ	(g_sched+gobuf_sp)(DX), BX
   253  	// make it look like mstart called systemstack on g0, to stop traceback
   254  	SUBQ	$8, BX
   255  	MOVQ	$runtime·mstart(SB), DX
   256  	MOVQ	DX, 0(BX)	// fake return address in the new top-of-stack slot
   257  	MOVQ	BX, SP
   258  
   259  	// call target function
   260  	MOVQ	DI, DX		// DX = fn, the closure value
   261  	MOVQ	0(DI), DI	// DI = code pointer stored in the first word of fn
   262  	CALL	DI
   263  
   264  	// switch back to g
   265  	get_tls(CX)
   266  	MOVQ	g(CX), AX
   267  	MOVQ	g_m(AX), BX
   268  	MOVQ	m_curg(BX), AX
   269  	MOVQ	AX, g(CX)
   270  	MOVQ	(g_sched+gobuf_sp)(AX), SP	// back onto the SP saved at switch
   271  	MOVQ	$0, (g_sched+gobuf_sp)(AX)	// the saved SP is consumed; zero it
   272  	RET
   273  
   274  noswitch:
   275  	// already on m stack, just call directly
   276  	MOVQ	DI, DX		// DX = fn, the closure value
   277  	MOVQ	0(DI), DI	// DI = code pointer stored in the first word of fn
   278  	CALL	DI
   279  	RET
   280  
   281  /*
   282   * support for morestack
   283   */
   284  
   285  // Called during function prolog when more stack is needed.
   286  // Records f's caller in m->morebuf and f itself in g->sched, then calls newstack on g0.
   287  // The traceback routines see morestack on a g0 as being
   288  // the top of a stack (for example, morestack calling newstack
   289  // calling the scheduler calling newm calling gc), so we must
   290  // record an argument size. For that purpose, it has no arguments.
   291  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   292  	// Cannot grow scheduler stack (m->g0).
   293  	get_tls(CX)
   294  	MOVQ	g(CX), BX
   295  	MOVQ	g_m(BX), BX	// BX = m for the rest of the function
   296  	MOVQ	m_g0(BX), SI
   297  	CMPQ	g(CX), SI
   298  	JNE	2(PC)
   299  	INT	$3		// running on g0: trap
   300  
   301  	// Cannot grow signal stack (m->gsignal).
   302  	MOVQ	m_gsignal(BX), SI
   303  	CMPQ	g(CX), SI
   304  	JNE	2(PC)
   305  	INT	$3		// running on gsignal: trap
   306  
   307  	// Called from f.
   308  	// Set m->morebuf to f's caller.
   309  	MOVQ	8(SP), AX	// f's caller's PC
   310  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   311  	LEAQ	16(SP), AX	// f's caller's SP
   312  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   313  	get_tls(CX)
   314  	MOVQ	g(CX), SI	// SI = current g
   315  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   316  
   317  	// Set g->sched to context in f.
   318  	MOVQ	0(SP), AX // f's PC
   319  	MOVQ	AX, (g_sched+gobuf_pc)(SI)
   320  	MOVQ	SI, (g_sched+gobuf_g)(SI)
   321  	LEAQ	8(SP), AX // f's SP
   322  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   323  	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)	// DX on entry is kept as f's ctxt
   324  	MOVQ	BP, (g_sched+gobuf_bp)(SI)
   325  
   326  	// Call newstack on m->g0's stack.
   327  	MOVQ	m_g0(BX), BX
   328  	MOVQ	BX, g(CX)	// current g = g0
   329  	MOVQ	(g_sched+gobuf_sp)(BX), SP
   330  	CALL	runtime·newstack(SB)
   331  	MOVQ	$0, 0x1003	// crash if newstack returns
   332  	RET
   333  
   334  // morestack but not preserving ctxt: DX is zeroed, so morestack records ctxt = 0.
   335  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   336  	MOVL	$0, DX		// 32-bit move also clears the upper half of DX
   337  	JMP	runtime·morestack(SB)
   338  
   339  // reflectcall: call a function with the given argument list
   340  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   341  // We don't have variable-sized frames, so we use a small number
   342  // of constant-sized-frame functions to encode a few bits of size in the pc.
   343  // Caution: ugly multiline assembly macros in your future!
   344  
   345  #define DISPATCH(NAME,MAXSIZE)		\
   346  	CMPQ	CX, $MAXSIZE;		/* CX holds argsize */ \
   347  	JA	3(PC);			/* too large for NAME: skip the jump, fall to the next size */ \
   348  	MOVQ	$NAME(SB), AX;		\
   349  	JMP	AX
   350  // Note: can't just "JMP NAME(SB)" - bad inlining results.
   351  
   352  TEXT reflect·call(SB), NOSPLIT, $0-0	// reflect-package entry point; forwards to ·reflectcall
   353  	JMP	·reflectcall(SB)
   354  
   355  TEXT ·reflectcall(SB), NOSPLIT, $0-32	// jump to the smallest callN whose frame holds argsize bytes
   356  	MOVLQZX argsize+24(FP), CX	// CX = argsize, zero-extended; DISPATCH compares against it
   357  	// NOTE(rsc): No call16, because CALLFN needs four words
   358  	// of argument space to invoke callwritebarrier.
   359  	DISPATCH(runtime·call32, 32)
   360  	DISPATCH(runtime·call64, 64)
   361  	DISPATCH(runtime·call128, 128)
   362  	DISPATCH(runtime·call256, 256)
   363  	DISPATCH(runtime·call512, 512)
   364  	DISPATCH(runtime·call1024, 1024)
   365  	DISPATCH(runtime·call2048, 2048)
   366  	DISPATCH(runtime·call4096, 4096)
   367  	DISPATCH(runtime·call8192, 8192)
   368  	DISPATCH(runtime·call16384, 16384)
   369  	DISPATCH(runtime·call32768, 32768)
   370  	DISPATCH(runtime·call65536, 65536)
   371  	DISPATCH(runtime·call131072, 131072)
   372  	DISPATCH(runtime·call262144, 262144)
   373  	DISPATCH(runtime·call524288, 524288)
   374  	DISPATCH(runtime·call1048576, 1048576)
   375  	DISPATCH(runtime·call2097152, 2097152)
   376  	DISPATCH(runtime·call4194304, 4194304)
   377  	DISPATCH(runtime·call8388608, 8388608)
   378  	DISPATCH(runtime·call16777216, 16777216)
   379  	DISPATCH(runtime·call33554432, 33554432)
   380  	DISPATCH(runtime·call67108864, 67108864)
   381  	DISPATCH(runtime·call134217728, 134217728)
   382  	DISPATCH(runtime·call268435456, 268435456)
   383  	DISPATCH(runtime·call536870912, 536870912)
   384  	DISPATCH(runtime·call1073741824, 1073741824)
   385  	MOVQ	$runtime·badreflectcall(SB), AX	// argsize exceeds the largest frame
   386  	JMP	AX
   387  
   388  #define CALLFN(NAME,MAXSIZE)			\
   389  TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
   390  	NO_LOCAL_POINTERS;			\
   391  	/* copy argsize bytes of arguments from argptr into the frame at SP */		\
   392  	MOVQ	argptr+16(FP), SI;		\
   393  	MOVLQZX argsize+24(FP), CX;		\
   394  	MOVQ	SP, DI;				\
   395  	REP;MOVSB;				\
   396  	/* call function: DX = f, target is the code pointer in its first word */			\
   397  	MOVQ	f+8(FP), DX;			\
   398  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   399  	CALL	(DX);				\
   400  	/* copy return values back: frame bytes [retoffset, argsize) to argptr */		\
   401  	MOVQ	argptr+16(FP), DI;		\
   402  	MOVLQZX	argsize+24(FP), CX;		\
   403  	MOVLQZX retoffset+28(FP), BX;		\
   404  	MOVQ	SP, SI;				\
   405  	ADDQ	BX, DI;				\
   406  	ADDQ	BX, SI;				\
   407  	SUBQ	BX, CX;				\
   408  	REP;MOVSB;				\
   409  	/* execute write barrier updates: callwritebarrier(argtype, argptr, argsize, retoffset) */	\
   410  	MOVQ	argtype+0(FP), DX;		\
   411  	MOVQ	argptr+16(FP), DI;		\
   412  	MOVLQZX	argsize+24(FP), CX;		\
   413  	MOVLQZX retoffset+28(FP), BX;		\
   414  	MOVQ	DX, 0(SP);			\
   415  	MOVQ	DI, 8(SP);			\
   416  	MOVQ	CX, 16(SP);			\
   417  	MOVQ	BX, 24(SP);			\
   418  	CALL	runtime·callwritebarrier(SB);	\
   419  	RET
   420  
   421  CALLFN(·call32, 32)	// one fixed-frame wrapper per size, matching the DISPATCH list in reflectcall
   422  CALLFN(·call64, 64)
   423  CALLFN(·call128, 128)
   424  CALLFN(·call256, 256)
   425  CALLFN(·call512, 512)
   426  CALLFN(·call1024, 1024)
   427  CALLFN(·call2048, 2048)
   428  CALLFN(·call4096, 4096)
   429  CALLFN(·call8192, 8192)
   430  CALLFN(·call16384, 16384)
   431  CALLFN(·call32768, 32768)
   432  CALLFN(·call65536, 65536)
   433  CALLFN(·call131072, 131072)
   434  CALLFN(·call262144, 262144)
   435  CALLFN(·call524288, 524288)
   436  CALLFN(·call1048576, 1048576)
   437  CALLFN(·call2097152, 2097152)
   438  CALLFN(·call4194304, 4194304)
   439  CALLFN(·call8388608, 8388608)
   440  CALLFN(·call16777216, 16777216)
   441  CALLFN(·call33554432, 33554432)
   442  CALLFN(·call67108864, 67108864)
   443  CALLFN(·call134217728, 134217728)
   444  CALLFN(·call268435456, 268435456)
   445  CALLFN(·call536870912, 536870912)
   446  CALLFN(·call1073741824, 1073741824)
   447  
   448  // bool cas(int32 *val, int32 old, int32 new)
   449  // 32-bit compare-and-swap, done as a single locked operation:
   450  //	if(*val != old)
   451  //		return 0;
   452  //	*val = new;
   453  //	return 1;
   454  // CMPXCHG takes the expected value from AX implicitly, so old must be loaded there.
   455  TEXT runtime·cas(SB), NOSPLIT, $0-17
   456  	MOVL	old+8(FP), AX		// expected value
   457  	MOVL	new+12(FP), DX		// replacement value
   458  	MOVQ	ptr+0(FP), DI		// address of the word
   459  	LOCK
   460  	CMPXCHGL	DX, 0(DI)
   461  	SETEQ	ret+16(FP)		// ZF is set exactly when the swap took place
   462  	RET
   463  
   464  // bool	runtime·cas64(uint64 *val, uint64 old, uint64 new)
   465  // 64-bit compare-and-swap, done as a single locked operation:
   466  //	if(*val != old)
   467  //		return 0;
   468  //	*val = new;
   469  //	return 1;
   470  // old is passed by value and must sit in AX,
   471  // which CMPXCHG uses implicitly as the comparand.
   472  TEXT runtime·cas64(SB), NOSPLIT, $0-25
   473  	MOVQ	old+8(FP), AX		// expected value
   474  	MOVQ	new+16(FP), DX		// replacement value
   475  	MOVQ	ptr+0(FP), DI		// address of the word
   476  	LOCK
   477  	CMPXCHGQ	DX, 0(DI)
   478  	SETEQ	ret+24(FP)		// ZF is set exactly when the swap took place
   479  	RET
   480  	
   481  TEXT runtime·casuintptr(SB), NOSPLIT, $0-25	// same frame layout as cas64, so reuse its body
   482  	JMP	runtime·cas64(SB)
   483  
   484  TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-16	// forwards to the 64-bit atomic load
   485  	JMP	runtime·atomicload64(SB)
   486  
   487  TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-16	// forwards to the 64-bit atomic load
   488  	JMP	runtime·atomicload64(SB)
   489  
   490  TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16	// forwards to the 64-bit atomic store
   491  	JMP	runtime·atomicstore64(SB)
   492  
   493  // bool casp1(void **val, void *old, void *new)
   494  // Pointer-sized compare-and-swap, done as a single locked operation:
   495  //	if(*val != old)
   496  //		return 0;
   497  //	*val = new;
   498  //	return 1;
   499  // CMPXCHG takes the expected value from AX implicitly, so old must be loaded there.
   500  TEXT runtime·casp1(SB), NOSPLIT, $0-25
   501  	MOVQ	old+8(FP), AX		// expected pointer
   502  	MOVQ	new+16(FP), DX		// replacement pointer
   503  	MOVQ	ptr+0(FP), DI		// address of the slot
   504  	LOCK
   505  	CMPXCHGQ	DX, 0(DI)
   506  	SETEQ	ret+24(FP)		// ZF is set exactly when the swap took place
   507  	RET
   508  
   509  // uint32 xadd(uint32 volatile *val, int32 delta)
   510  // Atomic 32-bit add that returns the updated value:
   511  //	*val += delta;
   512  //	return *val;
   513  TEXT runtime·xadd(SB), NOSPLIT, $0-20
   514  	MOVQ	ptr+0(FP), DI		// address of the counter
   515  	MOVL	delta+8(FP), DX		// DX: delta going in, previous *val coming out
   516  	MOVL	DX, SI			// keep a copy of delta; XADD overwrites DX
   517  	LOCK
   518  	XADDL	DX, 0(DI)
   519  	ADDL	SI, DX			// previous value + delta = value now stored
   520  	MOVL	DX, ret+16(FP)
   521  	RET
   522  
   523  TEXT runtime·xadd64(SB), NOSPLIT, $0-24	// atomic 64-bit *ptr += delta; returns the new value
   524  	MOVQ	ptr+0(FP), DI		// address of the counter
   525  	MOVQ	delta+8(FP), DX		// DX: delta going in, previous *ptr coming out
   526  	MOVQ	DX, SI			// keep a copy of delta; XADD overwrites DX
   527  	LOCK
   528  	XADDQ	DX, 0(DI)
   529  	ADDQ	SI, DX			// previous value + delta = value now stored
   530  	MOVQ	DX, ret+16(FP)
   531  	RET
   532  
   533  TEXT runtime·xchg(SB), NOSPLIT, $0-20	// atomically store new into *ptr; return the old 32-bit value
   534  	MOVL	new+8(FP), DX		// value to install
   535  	MOVQ	ptr+0(FP), CX		// target address
   536  	XCHGL	DX, 0(CX)		// XCHG with a memory operand locks implicitly
   537  	MOVL	DX, ret+16(FP)		// DX now carries the previous contents
   538  	RET
   539  
   540  TEXT runtime·xchg64(SB), NOSPLIT, $0-24	// atomically store new into *ptr; return the old 64-bit value
   541  	MOVQ	new+8(FP), DX		// value to install
   542  	MOVQ	ptr+0(FP), CX		// target address
   543  	XCHGQ	DX, 0(CX)		// XCHG with a memory operand locks implicitly
   544  	MOVQ	DX, ret+16(FP)		// DX now carries the previous contents
   545  	RET
   546  
   547  TEXT runtime·xchgp1(SB), NOSPLIT, $0-24	// atomically store new into *ptr; return the old 64-bit contents
   548  	MOVQ	new+8(FP), DX		// value to install
   549  	MOVQ	ptr+0(FP), CX		// address of the slot
   550  	XCHGQ	DX, 0(CX)		// XCHG with a memory operand locks implicitly
   551  	MOVQ	DX, ret+16(FP)		// DX now carries the previous contents
   552  	RET
   553  
   554  TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24	// same frame layout as xchg64, so reuse its body
   555  	JMP	runtime·xchg64(SB)
   556  
   557  TEXT runtime·procyield(SB),NOSPLIT,$0-0	// spin: one PAUSE per iteration while counting cycles down
   558  	MOVL	cycles+0(FP), CX	// iterations left
   559  again:
   560  	PAUSE
   561  	SUBL	$1, CX
   562  	JNZ	again			// keep spinning until the counter reaches zero
   563  	RET
   564  
   565  TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16	// atomic 64-bit store of val to *ptr
   566  	MOVQ	val+8(FP), DX		// value to publish
   567  	MOVQ	ptr+0(FP), CX		// destination slot
   568  	XCHGQ	DX, 0(CX)		// implicitly locked exchange; old contents discarded
   569  	RET
   570  
   571  TEXT runtime·atomicstore(SB), NOSPLIT, $0-12	// atomic 32-bit store of val to *ptr
   572  	MOVL	val+8(FP), DX		// value to publish
   573  	MOVQ	ptr+0(FP), CX		// destination address
   574  	XCHGL	DX, 0(CX)		// implicitly locked exchange; old contents discarded
   575  	RET
   576  
   577  TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16	// atomic 64-bit store of val to *ptr
   578  	MOVQ	val+8(FP), DX		// value to publish
   579  	MOVQ	ptr+0(FP), CX		// destination address
   580  	XCHGQ	DX, 0(CX)		// implicitly locked exchange; old contents discarded
   581  	RET
   582  
   583  // void	runtime·atomicor8(byte volatile*, byte); atomically *ptr |= val
   584  TEXT runtime·atomicor8(SB), NOSPLIT, $0-9
   585  	MOVB	val+8(FP), CX		// bits to set
   586  	MOVQ	ptr+0(FP), DX		// address of the byte
   587  	LOCK
   588  	ORB	CX, (DX)
   589  	RET
   590  
   591  // void	runtime·atomicand8(byte volatile*, byte); atomically *ptr &= val
   592  TEXT runtime·atomicand8(SB), NOSPLIT, $0-9
   593  	MOVB	val+8(FP), CX		// mask of bits to keep
   594  	MOVQ	ptr+0(FP), DX		// address of the byte
   595  	LOCK
   596  	ANDB	CX, (DX)
   597  	RET
   598  
   599  // void jmpdefer(fn, sp);
   600  // called from deferreturn.
   601  // 1. pop the caller
   602  // 2. subtract 5 bytes from the caller's return address, so its CALL runs again
   603  // 3. jmp to the argument
   604  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   605  	MOVQ	fv+0(FP), DX	// fn
   606  	MOVQ	argp+8(FP), BX	// caller sp
   607  	LEAQ	-8(BX), SP	// caller sp after CALL
   608  	SUBQ	$5, (SP)	// return to CALL again
   609  	MOVQ	0(DX), BX	// BX = code pointer stored in the first word of fn
   610  	JMP	BX	// but first run the deferred function
   611  
   612  // Save state of caller into g->sched. Smashes R8, R9. Leaves sched.g untouched.
   613  TEXT gosave<>(SB),NOSPLIT,$0
   614  	get_tls(R8)
   615  	MOVQ	g(R8), R8		// R8 = current g
   616  	MOVQ	0(SP), R9		// return address = caller's PC
   617  	MOVQ	R9, (g_sched+gobuf_pc)(R8)
   618  	LEAQ	8(SP), R9		// caller's SP, i.e. SP once this function returns
   619  	MOVQ	R9, (g_sched+gobuf_sp)(R8)
   620  	MOVQ	$0, (g_sched+gobuf_ret)(R8)
   621  	MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
   622  	MOVQ	BP, (g_sched+gobuf_bp)(R8)
   623  	RET
   624  
   625  // asmcgocall(void(*fn)(void*), void *arg)
   626  // Call fn(arg) on the scheduler stack,
   627  // aligned appropriately for the gcc ABI.
   628  // See cgocall.c for more details. (NOTE(review): file may since have become cgocall.go — confirm)
   629  TEXT ·asmcgocall(SB),NOSPLIT,$0-16
   630  	MOVQ	fn+0(FP), AX	// asmcgocall<> expects fn in AX
   631  	MOVQ	arg+8(FP), BX	// asmcgocall<> expects arg in BX
   632  	CALL	asmcgocall<>(SB)
   633  	RET
   634  
   635  TEXT ·asmcgocall_errno(SB),NOSPLIT,$0-20	// like asmcgocall, but also returns the 32-bit value left in AX
   636  	MOVQ	fn+0(FP), AX	// asmcgocall<> expects fn in AX
   637  	MOVQ	arg+8(FP), BX	// asmcgocall<> expects arg in BX
   638  	CALL	asmcgocall<>(SB)
   639  	MOVL	AX, ret+16(FP)	// errno, per the asmcgocall<> contract
   640  	RET
   641  
   642  // asmcgocall common code. fn in AX, arg in BX. returns errno in AX.
   643  TEXT asmcgocall<>(SB),NOSPLIT,$0-0
   644  	MOVQ	SP, DX		// DX = SP on entry, used below to compute the stack depth
   645  
   646  	// Figure out if we need to switch to m->g0 stack.
   647  	// We get called to create new OS threads too, and those
   648  	// come in on the m->g0 stack already.
   649  	get_tls(CX)
   650  	MOVQ	g(CX), R8
   651  	MOVQ	g_m(R8), R8	// R8 = m
   652  	MOVQ	m_g0(R8), SI
   653  	MOVQ	g(CX), DI	// DI = current g
   654  	CMPQ	SI, DI
   655  	JEQ	nosave		// already on g0
   656  	MOVQ	m_gsignal(R8), SI
   657  	CMPQ	SI, DI
   658  	JEQ	nosave		// on gsignal: do not switch either
   659  	
   660  	MOVQ	m_g0(R8), SI
   661  	CALL	gosave<>(SB)	// record this PC/SP in g->sched (smashes R8, R9)
   662  	MOVQ	SI, g(CX)	// current g = g0
   663  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   664  nosave:
   665  
   666  	// Now on a scheduling stack (a pthread-created stack).
   667  	// Make sure we have enough room for 4 stack-backed fast-call
   668  	// registers as per windows amd64 calling convention.
   669  	SUBQ	$64, SP
   670  	ANDQ	$~15, SP	// alignment for gcc ABI
   671  	MOVQ	DI, 48(SP)	// save g
   672  	MOVQ	(g_stack+stack_hi)(DI), DI
   673  	SUBQ	DX, DI		// depth = g->stack.hi - SP at entry
   674  	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   675  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   676  	MOVQ	BX, CX		// CX = first argument in Win64
   677  	CALL	AX
   678  
   679  	// Restore registers, g, stack pointer.
   680  	get_tls(CX)
   681  	MOVQ	48(SP), DI	// DI = the g saved before the call
   682  	MOVQ	(g_stack+stack_hi)(DI), SI
   683  	SUBQ	40(SP), SI	// SI = stack.hi - saved depth = original SP on the current stack
   684  	MOVQ	DI, g(CX)
   685  	MOVQ	SI, SP
   686  	RET
   687  
   688  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
   689  // Turn the fn into a Go func (by taking its address) and call
   690  // cgocallback_gofunc with (&fn, frame, framesize).
   691  TEXT runtime·cgocallback(SB),NOSPLIT,$24-24
   692  	LEAQ	fn+0(FP), AX	// &fn: the argument slot is used as the FuncVal
   693  	MOVQ	AX, 0(SP)
   694  	MOVQ	frame+8(FP), AX
   695  	MOVQ	AX, 8(SP)
   696  	MOVQ	framesize+16(FP), AX
   697  	MOVQ	AX, 16(SP)
   698  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   699  	CALL	AX
   700  	RET
   701  
   702  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
   703  // See cgocall.c for more details.
   704  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-24
   705  	NO_LOCAL_POINTERS
   706  
   707  	// If g is nil, Go did not create the current thread.
   708  	// Call needm to obtain one m for temporary use.
   709  	// In this case, we're running on the thread stack, so there's
   710  	// lots of space, but the linker doesn't know. Hide the call from
   711  	// the linker analysis by using an indirect call through AX.
   712  	get_tls(CX)
   713  #ifdef GOOS_windows
   714  	MOVL	$0, BX
   715  	CMPQ	CX, $0
   716  	JEQ	2(PC)		// no TLS base: skip the load and keep BX = 0
   717  #endif
   718  	MOVQ	g(CX), BX
   719  	CMPQ	BX, $0
   720  	JEQ	needm
   721  	MOVQ	g_m(BX), BX
   722  	MOVQ	BX, R8 // holds oldm until end of function
   723  	JMP	havem
   724  needm:
   725  	MOVQ	$0, 0(SP)	// oldm = nil, kept in the frame slot across the call
   726  	MOVQ	$runtime·needm(SB), AX
   727  	CALL	AX
   728  	MOVQ	0(SP), R8	// reload oldm from the frame slot
   729  	get_tls(CX)
   730  	MOVQ	g(CX), BX
   731  	MOVQ	g_m(BX), BX
   732  	
   733  	// Set m->sched.sp = SP, so that if a panic happens
   734  	// during the function we are about to execute, it will
   735  	// have a valid SP to run on the g0 stack.
   736  	// The next few lines (after the havem label)
   737  	// will save this SP onto the stack and then write
   738  	// the same SP back to m->sched.sp. That seems redundant,
   739  	// but if an unrecovered panic happens, unwindm will
   740  	// restore the g->sched.sp from the stack location
   741  	// and then systemstack will try to use it. If we don't set it here,
   742  	// that restored SP will be uninitialized (typically 0) and
   743  	// will not be usable.
   744  	MOVQ	m_g0(BX), SI
   745  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   746  
   747  havem:
   748  	// Now there's a valid m, and we're running on its m->g0.
   749  	// Save current m->g0->sched.sp on stack and then set it to SP.
   750  	// Save current sp in m->g0->sched.sp in preparation for
   751  	// switch back to m->curg stack.
   752  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   753  	MOVQ	m_g0(BX), SI
   754  	MOVQ	(g_sched+gobuf_sp)(SI), AX
   755  	MOVQ	AX, 0(SP)
   756  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   757  
   758  	// Switch to m->curg stack and call runtime.cgocallbackg.
   759  	// Because we are taking over the execution of m->curg
   760  	// but *not* resuming what had been running, we need to
   761  	// save that information (m->curg->sched) so we can restore it.
   762  	// We can restore m->curg->sched.sp easily, because calling
   763  	// runtime.cgocallbackg leaves SP unchanged upon return.
   764  	// To save m->curg->sched.pc, we push it onto the stack.
   765  	// This has the added benefit that it looks to the traceback
   766  	// routine like cgocallbackg is going to return to that
   767  	// PC (because the frame we allocate below has the same
   768  	// size as cgocallback_gofunc's frame declared above)
   769  	// so that the traceback will seamlessly trace back into
   770  	// the earlier calls.
   771  	//
   772  	// In the new goroutine, 0(SP) holds the saved R8.
   773  	MOVQ	m_curg(BX), SI
   774  	MOVQ	SI, g(CX)
   775  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   776  	MOVQ	(g_sched+gobuf_pc)(SI), BX
   777  	MOVQ	BX, -8(DI)	// saved curg PC sits where a return address would
   778  	// Compute the size of the frame, including return PC and, if
   779  	// GOEXPERIMENT=framepointer, the saved base pointer
   780  	LEAQ	fv+0(FP), AX
   781  	SUBQ	SP, AX		// AX = FP - SP = full frame size
   782  	SUBQ	AX, DI
   783  	MOVQ	DI, SP
   784  
   785  	MOVQ	R8, 0(SP)
   786  	CALL	runtime·cgocallbackg(SB)
   787  	MOVQ	0(SP), R8
   788  
   789  	// Compute the size of the frame again.  FP and SP have
   790  	// completely different values here than they did above,
   791  	// but only their difference matters.
   792  	LEAQ	fv+0(FP), AX
   793  	SUBQ	SP, AX
   794  
   795  	// Restore g->sched (== m->curg->sched) from saved values.
   796  	get_tls(CX)
   797  	MOVQ	g(CX), SI
   798  	MOVQ	SP, DI
   799  	ADDQ	AX, DI		// DI = SP + frame size = curg's sched.sp from before the call
   800  	MOVQ	-8(DI), BX	// the PC parked just below it
   801  	MOVQ	BX, (g_sched+gobuf_pc)(SI)
   802  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   803  
   804  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   805  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   806  	// so we do not have to restore it.)
   807  	MOVQ	g(CX), BX
   808  	MOVQ	g_m(BX), BX
   809  	MOVQ	m_g0(BX), SI
   810  	MOVQ	SI, g(CX)
   811  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   812  	MOVQ	0(SP), AX
   813  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   814  	
   815  	// If the m on entry was nil, we called needm above to borrow an m
   816  	// for the duration of the call. Since the call is over, return it with dropm.
   817  	CMPQ	R8, $0
   818  	JNE 3(PC)		// had an m on entry: skip the dropm call
   819  	MOVQ	$runtime·dropm(SB), AX
   820  	CALL	AX
   821  
   822  	// Done!
   823  	RET
   824  
   825  // void setg(G*); set g. for use by needm. On Windows, also point 0x28(GS) at m->tls (or 0 for a nil g).
   826  TEXT runtime·setg(SB), NOSPLIT, $0-8
   827  	MOVQ	gg+0(FP), BX
   828  #ifdef GOOS_windows
   829  	CMPQ	BX, $0
   830  	JNE	settls
   831  	MOVQ	$0, 0x28(GS)	// nil g: clear the slot and return
   832  	RET
   833  settls:
   834  	MOVQ	g_m(BX), AX
   835  	LEAQ	m_tls(AX), AX	// AX = &g->m->tls
   836  	MOVQ	AX, 0x28(GS)
   837  #endif
   838  	get_tls(CX)
   839  	MOVQ	BX, g(CX)	// store the new g through TLS
   840  	RET
   841  
   842  // void setg_gcc(G*); set g called from gcc. The new g arrives in DI, not on the Go stack.
   843  TEXT setg_gcc<>(SB),NOSPLIT,$0
   844  	get_tls(AX)
   845  	MOVQ	DI, g(AX)
   846  	RET
   847  
   848  // check that g->stack.lo < SP < g->stack.hi; trap with INT $3 otherwise
   849  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   850  	get_tls(CX)
   851  	MOVQ	g(CX), AX
   852  	CMPQ	(g_stack+stack_hi)(AX), SP
   853  	JHI	2(PC)		// stack.hi > SP (unsigned): upper bound holds
   854  	INT	$3
   855  	CMPQ	SP, (g_stack+stack_lo)(AX)
   856  	JHI	2(PC)		// SP > stack.lo (unsigned): lower bound holds
   857  	INT	$3
   858  	RET
   859  
   860  TEXT runtime·getcallerpc(SB),NOSPLIT,$0-16	// return the word stored just below argp
   861  	MOVQ	argp+0(FP), CX		// CX = address of the caller's first argument
   862  	MOVQ	-8(CX), CX		// the word below it is the calling pc
   863  	MOVQ	CX, ret+8(FP)
   864  	RET
   865  
   866  TEXT runtime·setcallerpc(SB),NOSPLIT,$0-16	// overwrite the word stored just below argp with pc
   867  	MOVQ	pc+8(FP), DX		// replacement pc
   868  	MOVQ	argp+0(FP), CX		// CX = address of the caller's first argument
   869  	MOVQ	DX, -8(CX)		// the word below it is the calling pc
   870  	RET
   871  
   872  TEXT runtime·getcallersp(SB),NOSPLIT,$0-16	// the result is simply argp itself
   873  	MOVQ	argp+0(FP), CX
   874  	MOVQ	CX, ret+8(FP)
   875  	RET
   876  
   877  // func cputicks() int64: fenced RDTSC, returned as (DX << 32) + AX
   878  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   879  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1	// flag set by rt0_go when the vendor is GenuineIntel
   880  	JNE	mfence
   881  	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
   882  	JMP	done
   883  mfence:
   884  	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
   885  done:
   886  	RDTSC
   887  	SHLQ	$32, DX		// high half of the counter
   888  	ADDQ	DX, AX		// combine with the low half in AX
   889  	MOVQ	AX, ret+0(FP)
   890  	RET
   891  
   892  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   893  // redirects to memhash(p, h, size) using the size
   894  // stored in the closure (read from 8(DX)).
   895  TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
   896  	GO_ARGS
   897  	NO_LOCAL_POINTERS
   898  	MOVQ	p+0(FP), AX
   899  	MOVQ	h+8(FP), BX
   900  	MOVQ	8(DX), CX	// size; assumes DX still holds the closure pointer on entry
   901  	MOVQ	AX, 0(SP)	// memhash argument 1: p
   902  	MOVQ	BX, 8(SP)	// memhash argument 2: h
   903  	MOVQ	CX, 16(SP)	// memhash argument 3: size
   904  	CALL	runtime·memhash(SB)
   905  	MOVQ	24(SP), AX	// memhash's result slot follows its three arguments
   906  	MOVQ	AX, ret+16(FP)
   907  	RET
   908  
   909  // hash function using AES hardware instructions; sets up AX/CX/DX and jumps to aeshashbody
   910  TEXT runtime·aeshash(SB),NOSPLIT,$0-32
   911  	MOVQ	p+0(FP), AX	// ptr to data
   912  	MOVQ	s+16(FP), CX	// size
   913  	LEAQ	ret+24(FP), DX	// address where aeshashbody stores the result
   914  	JMP	runtime·aeshashbody(SB)	// aeshashbody reads the seed itself from h+8(FP)
   915  
   916  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24	// string variant: unpack data and length, then share aeshashbody
   917  	MOVQ	p+0(FP), AX	// ptr to string struct
   918  	MOVQ	8(AX), CX	// length of string
   919  	MOVQ	(AX), AX	// string data
   920  	LEAQ	ret+16(FP), DX	// address where aeshashbody stores the result
   921  	JMP	runtime·aeshashbody(SB)	// aeshashbody reads the seed itself from h+8(FP)
   922  
   923  // AX: data
   924  // CX: length
   925  // DX: address to put return value
   926  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0	// hash CX bytes at AX with AES rounds; every exit stores an 8-byte result to (DX)
   927  	MOVQ	h+8(FP), X6	// seed to low 64 bits of xmm6
   928  	PINSRQ	$1, CX, X6	// size to high 64 bits of xmm6
   929  	PSHUFHW	$0, X6, X6	// replace size with its low 2 bytes repeated 4 times
   930  	MOVO	runtime·aeskeysched(SB), X7	// X7 = first 16 bytes of aeskeysched; used as the round key for the later scrambles
   931  	CMPQ	CX, $16	// dispatch on length, using unsigned comparisons
   932  	JB	aes0to15
   933  	JE	aes16
   934  	CMPQ	CX, $32
   935  	JBE	aes17to32
   936  	CMPQ	CX, $64
   937  	JBE	aes33to64
   938  	CMPQ	CX, $128
   939  	JBE	aes65to128
   940  	JMP	aes129plus
   941  
   942  aes0to15:
   943  	TESTQ	CX, CX
   944  	JE	aes0
   945  
   946  	ADDQ	$16, AX	// AX = data+16
   947  	TESTW	$0xff0, AX	// bits 4-11 of data+16 all zero => data is within 16 bytes of a 4096-byte boundary
   948  	JE	endofpage
   949  
   950  	// 16 bytes loaded at this address won't cross
   951  	// a page boundary, so we can load it directly.
   952  	MOVOU	-16(AX), X0	// 16 bytes starting at data (AX was advanced by 16 above)
   953  	ADDQ	CX, CX	// CX = 2*len, so CX*8 = 16*len selects a 16-byte table entry
   954  	MOVQ	$masks<>(SB), AX
   955  	PAND	(AX)(CX*8), X0	// keep the low len bytes, clear the rest
   956  
   957  	// scramble 3 times
   958  	AESENC	X6, X0
   959  	AESENC	X7, X0
   960  	AESENC	X7, X0
   961  	MOVQ	X0, (DX)
   962  	RET
   963  
   964  endofpage:
   965  	// low 12 bits of the address are 1111 1111 xxxx.  Might be up against
   966  	// a page boundary, so load ending at last byte.
   967  	// Then shift bytes down using pshufb.
   968  	MOVOU	-32(AX)(CX*1), X0	// AX is data+16 here, so this loads [data+len-16, data+len)
   969  	ADDQ	CX, CX	// CX = 2*len, so CX*8 = 16*len selects a 16-byte table entry
   970  	MOVQ	$shifts<>(SB), AX
   971  	PSHUFB	(AX)(CX*8), X0	// move the top len bytes to the bottom, zeroing the others
   972  	AESENC	X6, X0
   973  	AESENC	X7, X0
   974  	AESENC	X7, X0
   975  	MOVQ	X0, (DX)
   976  	RET
   977  
   978  aes0:
   979  	// return input seed
   980  	MOVQ	h+8(FP), AX
   981  	MOVQ	AX, (DX)
   982  	RET
   983  
   984  aes16:
   985  	MOVOU	(AX), X0	// exactly one 16-byte block
   986  	AESENC	X6, X0
   987  	AESENC	X7, X0
   988  	AESENC	X7, X0
   989  	MOVQ	X0, (DX)
   990  	RET
   991  
   992  aes17to32:
   993  	// load data to be hashed
   994  	MOVOU	(AX), X0	// first 16 bytes
   995  	MOVOU	-16(AX)(CX*1), X1	// last 16 bytes; overlaps the first load when len < 32
   996  
   997  	// scramble 3 times
   998  	AESENC	X6, X0
   999  	AESENC	runtime·aeskeysched+16(SB), X1	// each extra lane gets its own 16-byte slice of aeskeysched for the first round
  1000  	AESENC	X7, X0
  1001  	AESENC	X7, X1
  1002  	AESENC	X7, X0
  1003  	AESENC	X7, X1
  1004  
  1005  	// combine results
  1006  	PXOR	X1, X0
  1007  	MOVQ	X0, (DX)
  1008  	RET
  1009  
  1010  aes33to64:
  1011  	MOVOU	(AX), X0	// first 32 bytes in X0, X1
  1012  	MOVOU	16(AX), X1
  1013  	MOVOU	-32(AX)(CX*1), X2	// last 32 bytes in X2, X3; overlaps the first 32 when len < 64
  1014  	MOVOU	-16(AX)(CX*1), X3
  1015  	
  1016  	AESENC	X6, X0
  1017  	AESENC	runtime·aeskeysched+16(SB), X1
  1018  	AESENC	runtime·aeskeysched+32(SB), X2
  1019  	AESENC	runtime·aeskeysched+48(SB), X3
  1020  	AESENC	X7, X0
  1021  	AESENC	X7, X1
  1022  	AESENC	X7, X2
  1023  	AESENC	X7, X3
  1024  	AESENC	X7, X0
  1025  	AESENC	X7, X1
  1026  	AESENC	X7, X2
  1027  	AESENC	X7, X3
  1028  
  1029  	PXOR	X2, X0	// fold the four lanes into X0
  1030  	PXOR	X3, X1
  1031  	PXOR	X1, X0
  1032  	MOVQ	X0, (DX)
  1033  	RET
  1034  
  1035  aes65to128:
  1036  	MOVOU	(AX), X0	// first 64 bytes in X0-X3
  1037  	MOVOU	16(AX), X1
  1038  	MOVOU	32(AX), X2
  1039  	MOVOU	48(AX), X3
  1040  	MOVOU	-64(AX)(CX*1), X4	// last 64 bytes in X4, X5, X8, X9; overlaps the first 64 when len < 128
  1041  	MOVOU	-48(AX)(CX*1), X5
  1042  	MOVOU	-32(AX)(CX*1), X8
  1043  	MOVOU	-16(AX)(CX*1), X9
  1044  	
  1045  	AESENC	X6, X0
  1046  	AESENC	runtime·aeskeysched+16(SB), X1
  1047  	AESENC	runtime·aeskeysched+32(SB), X2
  1048  	AESENC	runtime·aeskeysched+48(SB), X3
  1049  	AESENC	runtime·aeskeysched+64(SB), X4
  1050  	AESENC	runtime·aeskeysched+80(SB), X5
  1051  	AESENC	runtime·aeskeysched+96(SB), X8
  1052  	AESENC	runtime·aeskeysched+112(SB), X9
  1053  	AESENC	X7, X0
  1054  	AESENC	X7, X1
  1055  	AESENC	X7, X2
  1056  	AESENC	X7, X3
  1057  	AESENC	X7, X4
  1058  	AESENC	X7, X5
  1059  	AESENC	X7, X8
  1060  	AESENC	X7, X9
  1061  	AESENC	X7, X0
  1062  	AESENC	X7, X1
  1063  	AESENC	X7, X2
  1064  	AESENC	X7, X3
  1065  	AESENC	X7, X4
  1066  	AESENC	X7, X5
  1067  	AESENC	X7, X8
  1068  	AESENC	X7, X9
  1069  
  1070  	PXOR	X4, X0	// fold the eight lanes into X0
  1071  	PXOR	X5, X1
  1072  	PXOR	X8, X2
  1073  	PXOR	X9, X3
  1074  	PXOR	X2, X0
  1075  	PXOR	X3, X1
  1076  	PXOR	X1, X0
  1077  	MOVQ	X0, (DX)
  1078  	RET
  1079  
  1080  aes129plus:
  1081  	// start with last (possibly overlapping) block
  1082  	MOVOU	-128(AX)(CX*1), X0	// the final 128 bytes seed the eight state lanes
  1083  	MOVOU	-112(AX)(CX*1), X1
  1084  	MOVOU	-96(AX)(CX*1), X2
  1085  	MOVOU	-80(AX)(CX*1), X3
  1086  	MOVOU	-64(AX)(CX*1), X4
  1087  	MOVOU	-48(AX)(CX*1), X5
  1088  	MOVOU	-32(AX)(CX*1), X8
  1089  	MOVOU	-16(AX)(CX*1), X9
  1090  
  1091  	// scramble state once
  1092  	AESENC	X6, X0
  1093  	AESENC	runtime·aeskeysched+16(SB), X1
  1094  	AESENC	runtime·aeskeysched+32(SB), X2
  1095  	AESENC	runtime·aeskeysched+48(SB), X3
  1096  	AESENC	runtime·aeskeysched+64(SB), X4
  1097  	AESENC	runtime·aeskeysched+80(SB), X5
  1098  	AESENC	runtime·aeskeysched+96(SB), X8
  1099  	AESENC	runtime·aeskeysched+112(SB), X9
  1100  
  1101  	// compute number of remaining 128-byte blocks
  1102  	DECQ	CX
  1103  	SHRQ	$7, CX	// CX = (len-1)/128, at least 1 because len >= 129 here
  1104  	
  1105  aesloop:
  1106  	// scramble state, xor in a block
  1107  	MOVOU	(AX), X10	// the data itself is used as the AESENC round-key operand
  1108  	MOVOU	16(AX), X11
  1109  	MOVOU	32(AX), X12
  1110  	MOVOU	48(AX), X13
  1111  	AESENC	X10, X0
  1112  	AESENC	X11, X1
  1113  	AESENC	X12, X2
  1114  	AESENC	X13, X3
  1115  	MOVOU	64(AX), X10
  1116  	MOVOU	80(AX), X11
  1117  	MOVOU	96(AX), X12
  1118  	MOVOU	112(AX), X13
  1119  	AESENC	X10, X4
  1120  	AESENC	X11, X5
  1121  	AESENC	X12, X8
  1122  	AESENC	X13, X9
  1123  
  1124  	// scramble state
  1125  	AESENC	X7, X0
  1126  	AESENC	X7, X1
  1127  	AESENC	X7, X2
  1128  	AESENC	X7, X3
  1129  	AESENC	X7, X4
  1130  	AESENC	X7, X5
  1131  	AESENC	X7, X8
  1132  	AESENC	X7, X9
  1133  
  1134  	ADDQ	$128, AX	// advance to the next 128-byte block
  1135  	DECQ	CX
  1136  	JNE	aesloop
  1137  
  1138  	// 2 more scrambles to finish
  1139  	AESENC	X7, X0
  1140  	AESENC	X7, X1
  1141  	AESENC	X7, X2
  1142  	AESENC	X7, X3
  1143  	AESENC	X7, X4
  1144  	AESENC	X7, X5
  1145  	AESENC	X7, X8
  1146  	AESENC	X7, X9
  1147  	AESENC	X7, X0
  1148  	AESENC	X7, X1
  1149  	AESENC	X7, X2
  1150  	AESENC	X7, X3
  1151  	AESENC	X7, X4
  1152  	AESENC	X7, X5
  1153  	AESENC	X7, X8
  1154  	AESENC	X7, X9
  1155  
  1156  	PXOR	X4, X0	// fold the eight lanes into X0
  1157  	PXOR	X5, X1
  1158  	PXOR	X8, X2
  1159  	PXOR	X9, X3
  1160  	PXOR	X2, X0
  1161  	PXOR	X3, X1
  1162  	PXOR	X1, X0
  1163  	MOVQ	X0, (DX)
  1164  	RET
  1165  	
  1166  TEXT runtime·aeshash32(SB),NOSPLIT,$0-24	// hashes the 4 bytes at p with seed h using three AES rounds
  1167  	MOVQ	p+0(FP), AX	// ptr to data
  1168  	MOVQ	h+8(FP), X0	// seed
  1169  	PINSRD	$2, (AX), X0	// data
  1170  	AESENC	runtime·aeskeysched+0(SB), X0	// three rounds, each keyed by a different 16-byte slice of aeskeysched
  1171  	AESENC	runtime·aeskeysched+16(SB), X0
  1172  	AESENC	runtime·aeskeysched+32(SB), X0
  1173  	MOVQ	X0, ret+16(FP)	// result = low 8 bytes of X0
  1174  	RET
  1175  
  1176  TEXT runtime·aeshash64(SB),NOSPLIT,$0-24	// hashes the 8 bytes at p with seed h using three AES rounds
  1177  	MOVQ	p+0(FP), AX	// ptr to data
  1178  	MOVQ	h+8(FP), X0	// seed
  1179  	PINSRQ	$1, (AX), X0	// data
  1180  	AESENC	runtime·aeskeysched+0(SB), X0	// three rounds, each keyed by a different 16-byte slice of aeskeysched
  1181  	AESENC	runtime·aeskeysched+16(SB), X0
  1182  	AESENC	runtime·aeskeysched+32(SB), X0
  1183  	MOVQ	X0, ret+16(FP)	// result = low 8 bytes of X0
  1184  	RET
  1185  
  1186  // masks<>: 16 entries of 16 bytes each; entry i has its low i bytes set to 0xff, so a PAND with it gets rid of data in the high part of the register.
  1187  DATA masks<>+0x00(SB)/8, $0x0000000000000000	// entry 0: keep nothing
  1188  DATA masks<>+0x08(SB)/8, $0x0000000000000000
  1189  DATA masks<>+0x10(SB)/8, $0x00000000000000ff	// entry 1: keep the low byte
  1190  DATA masks<>+0x18(SB)/8, $0x0000000000000000
  1191  DATA masks<>+0x20(SB)/8, $0x000000000000ffff
  1192  DATA masks<>+0x28(SB)/8, $0x0000000000000000
  1193  DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
  1194  DATA masks<>+0x38(SB)/8, $0x0000000000000000
  1195  DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
  1196  DATA masks<>+0x48(SB)/8, $0x0000000000000000
  1197  DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
  1198  DATA masks<>+0x58(SB)/8, $0x0000000000000000
  1199  DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
  1200  DATA masks<>+0x68(SB)/8, $0x0000000000000000
  1201  DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
  1202  DATA masks<>+0x78(SB)/8, $0x0000000000000000
  1203  DATA masks<>+0x80(SB)/8, $0xffffffffffffffff	// entry 8: keep the whole low quadword
  1204  DATA masks<>+0x88(SB)/8, $0x0000000000000000
  1205  DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
  1206  DATA masks<>+0x98(SB)/8, $0x00000000000000ff
  1207  DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
  1208  DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
  1209  DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
  1210  DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
  1211  DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
  1212  DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
  1213  DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
  1214  DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
  1215  DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
  1216  DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
  1217  DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff	// entry 15: keep the low 15 bytes
  1218  DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
  1219  GLOBL masks<>(SB),RODATA,$256
  1220  
  1221  // these are arguments to pshufb.  They move data down from
  1222  // the high bytes of the register to the low bytes of the register.
  1223  // index (entry number, 16 bytes per entry) is how many bytes to move; a 0xff selector byte makes pshufb zero that destination byte.
  1224  DATA shifts<>+0x00(SB)/8, $0x0000000000000000	// entry 0
  1225  DATA shifts<>+0x08(SB)/8, $0x0000000000000000
  1226  DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f	// entry 1: byte 15 -> byte 0, everything else zeroed
  1227  DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
  1228  DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
  1229  DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
  1230  DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
  1231  DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
  1232  DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
  1233  DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
  1234  DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
  1235  DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
  1236  DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
  1237  DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
  1238  DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
  1239  DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
  1240  DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908	// entry 8: high quadword -> low quadword
  1241  DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
  1242  DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
  1243  DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
  1244  DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
  1245  DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
  1246  DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
  1247  DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
  1248  DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
  1249  DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
  1250  DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
  1251  DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
  1252  DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
  1253  DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
  1254  DATA shifts<>+0xf0(SB)/8, $0x0807060504030201	// entry 15: bytes 1..15 -> bytes 0..14
  1255  DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
  1256  GLOBL shifts<>(SB),RODATA,$256
  1257  
  1258  TEXT runtime·memeq(SB),NOSPLIT,$0-25	// compares size bytes at a and b via memeqbody and stores the one-byte result in ret
  1259  	MOVQ	a+0(FP), SI	// memeqbody takes a in SI,
  1260  	MOVQ	b+8(FP), DI	// b in DI,
  1261  	MOVQ	size+16(FP), BX	// and the byte count in BX
  1262  	CALL	runtime·memeqbody(SB)
  1263  	MOVB	AX, ret+24(FP)	// memeqbody leaves 1 (equal) or 0 in AX
  1264  	RET
  1265  
  1266  // memequal_varlen(a, b unsafe.Pointer) bool
  1267  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17	// like memeq, but the byte count is read from 8(DX) instead of an argument
  1268  	MOVQ	a+0(FP), SI
  1269  	MOVQ	b+8(FP), DI
  1270  	CMPQ	SI, DI
  1271  	JEQ	eq	// identical pointers: equal without touching memory
  1272  	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
  1273  	CALL	runtime·memeqbody(SB)
  1274  	MOVB	AX, ret+16(FP)	// memeqbody leaves 1 (equal) or 0 in AX
  1275  	RET
  1276  eq:
  1277  	MOVB	$1, ret+16(FP)
  1278  	RET
  1279  
  1280  // eqstring tests whether two strings are equal.
  1281  // The compiler guarantees that strings passed
  1282  // to eqstring have equal length.
  1283  // See runtime_test.go:eqstring_generic for
  1284  // equivalent Go code.
  1285  TEXT runtime·eqstring(SB),NOSPLIT,$0-33
  1286  	MOVQ	s1str+0(FP), SI
  1287  	MOVQ	s2str+16(FP), DI
  1288  	CMPQ	SI, DI
  1289  	JEQ	eq	// same data pointer (and, per the contract above, same length): equal
  1290  	MOVQ	s1len+8(FP), BX	// only s1's length is read; s2's is never loaded
  1291  	CALL	runtime·memeqbody(SB)
  1292  	MOVB	AX, v+32(FP)	// memeqbody leaves 1 (equal) or 0 in AX
  1293  	RET
  1294  eq:
  1295  	MOVB	$1, v+32(FP)
  1296  	RET
  1297  
  1298  // a in SI
  1299  // b in DI
  1300  // count in BX
  1301  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0	// result in AX: 1 if the count bytes at a and b are equal, else 0
  1302  	XORQ	AX, AX	// default result 0; the early RETs on a mismatch rely on this
  1303  
  1304  	CMPQ	BX, $8
  1305  	JB	small
  1306  	
  1307  	// 64 bytes at a time using xmm registers
  1308  hugeloop:
  1309  	CMPQ	BX, $64
  1310  	JB	bigloop
  1311  	MOVOU	(SI), X0
  1312  	MOVOU	(DI), X1
  1313  	MOVOU	16(SI), X2
  1314  	MOVOU	16(DI), X3
  1315  	MOVOU	32(SI), X4
  1316  	MOVOU	32(DI), X5
  1317  	MOVOU	48(SI), X6
  1318  	MOVOU	48(DI), X7
  1319  	PCMPEQB	X1, X0	// each byte becomes 0xff where a and b agree
  1320  	PCMPEQB	X3, X2
  1321  	PCMPEQB	X5, X4
  1322  	PCMPEQB	X7, X6
  1323  	PAND	X2, X0	// AND the four comparison results together
  1324  	PAND	X6, X4
  1325  	PAND	X4, X0
  1326  	PMOVMSKB X0, DX	// one bit per byte position
  1327  	ADDQ	$64, SI
  1328  	ADDQ	$64, DI
  1329  	SUBQ	$64, BX
  1330  	CMPL	DX, $0xffff	// all 16 bits set => all 64 bytes matched
  1331  	JEQ	hugeloop
  1332  	RET	// mismatch: AX is still 0
  1333  
  1334  	// 8 bytes at a time using 64-bit register
  1335  bigloop:
  1336  	CMPQ	BX, $8
  1337  	JBE	leftover
  1338  	MOVQ	(SI), CX
  1339  	MOVQ	(DI), DX
  1340  	ADDQ	$8, SI
  1341  	ADDQ	$8, DI
  1342  	SUBQ	$8, BX
  1343  	CMPQ	CX, DX
  1344  	JEQ	bigloop
  1345  	RET	// mismatch: AX is still 0
  1346  
  1347  	// remaining 0-8 bytes
  1348  leftover:
  1349  	MOVQ	-8(SI)(BX*1), CX	// the 8 bytes ending at the end of a; may re-read bytes already compared
  1350  	MOVQ	-8(DI)(BX*1), DX
  1351  	CMPQ	CX, DX
  1352  	SETEQ	AX
  1353  	RET
  1354  
  1355  small:
  1356  	CMPQ	BX, $0
  1357  	JEQ	equal	// ZF is set here, so the SETEQ at equal yields 1
  1358  
  1359  	LEAQ	0(BX*8), CX	// CX = 8*count (bits wanted)
  1360  	NEGQ	CX	// as a shift count only the low 6 bits are used: 64 - 8*count
  1361  
  1362  	CMPB	SI, $0xf8	// low address byte > 0xf8 => an 8-byte load would run past a 256-byte boundary
  1363  	JA	si_high
  1364  
  1365  	// load at SI won't cross a page boundary.
  1366  	MOVQ	(SI), SI
  1367  	JMP	si_finish
  1368  si_high:
  1369  	// address ends in 11111xxx.  Load up to bytes we want, move to correct position.
  1370  	MOVQ	-8(SI)(BX*1), SI
  1371  	SHRQ	CX, SI	// wanted bytes now occupy the low count bytes
  1372  si_finish:
  1373  
  1374  	// same for DI.
  1375  	CMPB	DI, $0xf8
  1376  	JA	di_high
  1377  	MOVQ	(DI), DI
  1378  	JMP	di_finish
  1379  di_high:
  1380  	MOVQ	-8(DI)(BX*1), DI
  1381  	SHRQ	CX, DI
  1382  di_finish:
  1383  
  1384  	SUBQ	SI, DI	// low count bytes of the difference are zero iff the low count bytes match
  1385  	SHLQ	CX, DI	// shift out the bytes beyond count; ZF is set iff the wanted bytes were equal
  1386  equal:
  1387  	SETEQ	AX
  1388  	RET
  1389  
  1390  TEXT runtime·cmpstring(SB),NOSPLIT,$0-40	// three-way compare of two strings; loads the cmpbody registers and stores its AX result
  1391  	MOVQ	s1_base+0(FP), SI	// SI = a
  1392  	MOVQ	s1_len+8(FP), BX	// BX = alen
  1393  	MOVQ	s2_base+16(FP), DI	// DI = b
  1394  	MOVQ	s2_len+24(FP), DX	// DX = blen
  1395  	CALL	runtime·cmpbody(SB)
  1396  	MOVQ	AX, ret+32(FP)	// 1/0/-1
  1397  	RET
  1398  
  1399  TEXT bytes·Compare(SB),NOSPLIT,$0-56	// three-way compare of two byte slices; loads the cmpbody registers and stores its AX result
  1400  	MOVQ	s1+0(FP), SI	// SI = a (word at offset 0 of s1)
  1401  	MOVQ	s1+8(FP), BX	// BX = alen (word at offset 8 of s1)
  1402  	MOVQ	s2+24(FP), DI	// DI = b
  1403  	MOVQ	s2+32(FP), DX	// DX = blen
  1404  	CALL	runtime·cmpbody(SB)
  1405  	MOVQ	AX, res+48(FP)	// 1/0/-1
  1406  	RET
  1407  
  1408  // input:
  1409  //   SI = a
  1410  //   DI = b
  1411  //   BX = alen
  1412  //   DX = blen
  1413  // output:
  1414  //   AX = 1/0/-1
  1415  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1416  	CMPQ	SI, DI
  1417  	JEQ	allsame	// same base pointer: the result depends only on the lengths
  1418  	CMPQ	BX, DX
  1419  	MOVQ	DX, R8
  1420  	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
  1421  	CMPQ	R8, $8
  1422  	JB	small
  1423  
  1424  loop:
  1425  	CMPQ	R8, $16
  1426  	JBE	_0through16
  1427  	MOVOU	(SI), X0
  1428  	MOVOU	(DI), X1
  1429  	PCMPEQB X0, X1	// each byte becomes 0xff where a and b agree
  1430  	PMOVMSKB X1, AX
  1431  	XORQ	$0xffff, AX	// convert EQ to NE
  1432  	JNE	diff16	// branch if at least one byte is not equal
  1433  	ADDQ	$16, SI
  1434  	ADDQ	$16, DI
  1435  	SUBQ	$16, R8
  1436  	JMP	loop
  1437  	
  1438  	// AX = bit mask of differences
  1439  diff16:
  1440  	BSFQ	AX, BX	// index of first byte that differs
  1441  	XORQ	AX, AX
  1442  	MOVB	(SI)(BX*1), CX
  1443  	CMPB	CX, (DI)(BX*1)
  1444  	SETHI	AX	// 1 if a's byte is above b's byte (unsigned)
  1445  	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
  1446  	RET
  1447  
  1448  	// 0 through 16 bytes left, alen>=8, blen>=8
  1449  _0through16:
  1450  	CMPQ	R8, $8
  1451  	JBE	_0through8
  1452  	MOVQ	(SI), AX
  1453  	MOVQ	(DI), CX
  1454  	CMPQ	AX, CX
  1455  	JNE	diff8
  1456  _0through8:
  1457  	MOVQ	-8(SI)(R8*1), AX	// the 8 bytes ending at the end of the common prefix; may re-read compared bytes
  1458  	MOVQ	-8(DI)(R8*1), CX
  1459  	CMPQ	AX, CX
  1460  	JEQ	allsame
  1461  
  1462  	// AX and CX contain parts of a and b that differ.
  1463  diff8:
  1464  	BSWAPQ	AX	// reverse order of bytes
  1465  	BSWAPQ	CX
  1466  	XORQ	AX, CX
  1467  	BSRQ	CX, CX	// index of highest bit difference
  1468  	SHRQ	CX, AX	// move a's bit to bottom
  1469  	ANDQ	$1, AX	// mask bit
  1470  	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
  1471  	RET
  1472  
  1473  	// 0-7 bytes in common
  1474  small:
  1475  	LEAQ	(R8*8), CX	// bytes left -> bits left
  1476  	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
  1477  	JEQ	allsame
  1478  
  1479  	// load bytes of a into high bytes of SI
  1480  	CMPB	SI, $0xf8	// low address byte > 0xf8 => an 8-byte load would run past a 256-byte boundary
  1481  	JA	si_high
  1482  	MOVQ	(SI), SI
  1483  	JMP	si_finish
  1484  si_high:
  1485  	MOVQ	-8(SI)(R8*1), SI
  1486  	SHRQ	CX, SI
  1487  si_finish:
  1488  	SHLQ	CX, SI
  1489  
  1490  	// load bytes of b into high bytes of DI
  1491  	CMPB	DI, $0xf8
  1492  	JA	di_high
  1493  	MOVQ	(DI), DI
  1494  	JMP	di_finish
  1495  di_high:
  1496  	MOVQ	-8(DI)(R8*1), DI
  1497  	SHRQ	CX, DI
  1498  di_finish:
  1499  	SHLQ	CX, DI
  1500  
  1501  	BSWAPQ	SI	// reverse order of bytes
  1502  	BSWAPQ	DI
  1503  	XORQ	SI, DI	// find bit differences
  1504  	JEQ	allsame
  1505  	BSRQ	DI, CX	// index of highest bit difference
  1506  	SHRQ	CX, SI	// move a's bit to bottom
  1507  	ANDQ	$1, SI	// mask bit
  1508  	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
  1509  	RET
  1510  
  1511  allsame:
  1512  	XORQ	AX, AX
  1513  	XORQ	CX, CX
  1514  	CMPQ	BX, DX
  1515  	SETGT	AX	// 1 if alen > blen
  1516  	SETEQ	CX	// 1 if alen == blen
  1517  	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
  1518  	RET
  1519  
  1520  TEXT bytes·IndexByte(SB),NOSPLIT,$0-40	// loads the indexbytebody registers from the slice s and byte c, then stores its AX result
  1521  	MOVQ s+0(FP), SI	// SI = data
  1522  	MOVQ s_len+8(FP), BX	// BX = data len
  1523  	MOVB c+24(FP), AL	// AL = byte sought
  1524  	CALL runtime·indexbytebody(SB)
  1525  	MOVQ AX, ret+32(FP)
  1526  	RET
  1527  
  1528  TEXT strings·IndexByte(SB),NOSPLIT,$0-32	// loads the indexbytebody registers from the string s and byte c, then stores its AX result
  1529  	MOVQ s+0(FP), SI	// SI = data
  1530  	MOVQ s_len+8(FP), BX	// BX = data len
  1531  	MOVB c+16(FP), AL	// AL = byte sought
  1532  	CALL runtime·indexbytebody(SB)
  1533  	MOVQ AX, ret+24(FP)
  1534  	RET
  1535  
  1536  // input:
  1537  //   SI: data
  1538  //   BX: data len
  1539  //   AL: byte sought
  1540  // output:
  1541  //   AX: index of the first matching byte, or -1 if there is none
  1542  TEXT runtime·indexbytebody(SB),NOSPLIT,$0
  1543  	MOVQ SI, DI	// DI is the scan cursor; SI keeps the start for the final index computation
  1544  
  1545  	CMPQ BX, $16
  1546  	JLT small
  1547  
  1548  	// round up to first 16-byte boundary
  1549  	TESTQ $15, SI
  1550  	JZ aligned
  1551  	MOVQ SI, CX
  1552  	ANDQ $~15, CX
  1553  	ADDQ $16, CX
  1554  
  1555  	// search the beginning
  1556  	SUBQ SI, CX	// CX = number of bytes before the first 16-byte boundary
  1557  	REPN; SCASB	// scan while bytes differ from AL; ZF set on a match, DI left one past it
  1558  	JZ success
  1559  
  1560  // DI is 16-byte aligned; get ready to search using SSE instructions
  1561  aligned:
  1562  	// round down to last 16-byte boundary
  1563  	MOVQ BX, R11
  1564  	ADDQ SI, R11
  1565  	ANDQ $~15, R11	// R11 = end of data rounded down to 16 bytes
  1566  
  1567  	// shuffle X0 around so that each byte contains c
  1568  	MOVD AX, X0
  1569  	PUNPCKLBW X0, X0
  1570  	PUNPCKLBW X0, X0
  1571  	PSHUFL $0, X0, X0
  1572  	JMP condition
  1573  
  1574  sse:
  1575  	// move the next 16-byte chunk of the buffer into X1
  1576  	MOVO (DI), X1
  1577  	// compare bytes in X0 to X1
  1578  	PCMPEQB X0, X1
  1579  	// take the top bit of each byte in X1 and put the result in DX
  1580  	PMOVMSKB X1, DX
  1581  	TESTL DX, DX
  1582  	JNZ ssesuccess
  1583  	ADDQ $16, DI
  1584  
  1585  condition:
  1586  	CMPQ DI, R11
  1587  	JLT sse
  1588  
  1589  	// search the end
  1590  	MOVQ SI, CX
  1591  	ADDQ BX, CX
  1592  	SUBQ R11, CX	// CX = bytes remaining after the last full 16-byte chunk
  1593  	// if CX == 0, the zero flag will be set and we'll end up
  1594  	// returning a false success
  1595  	JZ failure
  1596  	REPN; SCASB
  1597  	JZ success
  1598  
  1599  failure:
  1600  	MOVQ $-1, AX
  1601  	RET
  1602  
  1603  // handle for lengths < 16
  1604  small:
  1605  	MOVQ BX, CX
  1606  	REPN; SCASB
  1607  	JZ success
  1608  	MOVQ $-1, AX
  1609  	RET
  1610  
  1611  // we've found the chunk containing the byte
  1612  // now just figure out which specific byte it is
  1613  ssesuccess:
  1614  	// get the index of the least significant set bit
  1615  	BSFW DX, DX
  1616  	SUBQ SI, DI	// DI = offset of the matching chunk from the start
  1617  	ADDQ DI, DX
  1618  	MOVQ DX, AX
  1619  	RET
  1620  
  1621  success:
  1622  	SUBQ SI, DI	// SCASB left DI one past the match
  1623  	SUBQ $1, DI	// 64-bit subtract: a 32-bit SUBL would clear the upper half and truncate offsets >= 4GiB found by the tail scan
  1624  	MOVQ DI, AX
  1625  	RET
  1626  
  1627  TEXT bytes·Equal(SB),NOSPLIT,$0-49	// stores 1 in ret if slices a and b have the same length and contents, else 0
  1628  	MOVQ	a_len+8(FP), BX
  1629  	MOVQ	b_len+32(FP), CX
  1630  	XORQ	AX, AX	// default result 0, used when the lengths differ
  1631  	CMPQ	BX, CX
  1632  	JNE	eqret
  1633  	MOVQ	a+0(FP), SI
  1634  	MOVQ	b+24(FP), DI
  1635  	CALL	runtime·memeqbody(SB)	// BX already holds the common length; result comes back in AX
  1636  eqret:
  1637  	MOVB	AX, ret+48(FP)
  1638  	RET
  1639  
  1640  TEXT runtime·fastrand1(SB), NOSPLIT, $0-4	// advances the current m's fastrand state by one step and returns the new 32-bit value
  1641  	get_tls(CX)
  1642  	MOVQ	g(CX), AX
  1643  	MOVQ	g_m(AX), AX	// AX = m of the current g
  1644  	MOVL	m_fastrand(AX), DX
  1645  	ADDL	DX, DX	// x += x
  1646  	MOVL	DX, BX	// BX = doubled value before the XOR
  1647  	XORL	$0x88888eef, DX	// the constant has its top bit set, so the XOR flips the sign bit
  1648  	CMOVLMI	BX, DX	// XORed value negative => doubled value had a clear top bit => keep the un-XORed value
  1649  	MOVL	DX, m_fastrand(AX)
  1650  	MOVL	DX, ret+0(FP)
  1651  	RET
  1652  
  1653  TEXT runtime·return0(SB), NOSPLIT, $0	// sets AX to 0 and returns; touches nothing else
  1654  	MOVL	$0, AX	// 32-bit move also clears the upper half of AX
  1655  	RET
  1656  
  1657  
  1658  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  1659  // Must obey the gcc calling convention.
  1660  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  1661  	get_tls(CX)
  1662  	MOVQ	g(CX), AX	// AX = g
  1663  	MOVQ	g_m(AX), AX	// AX = g->m
  1664  	MOVQ	m_curg(AX), AX	// AX = g->m->curg
  1665  	MOVQ	(g_stack+stack_hi)(AX), AX	// result in AX; only AX and CX are written
  1666  	RET
  1667  
  1668  // The top-most function running on a goroutine
  1669  // returns to goexit+PCQuantum.
  1670  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  1671  	BYTE	$0x90	// NOP
  1672  	CALL	runtime·goexit1(SB)	// does not return
  1673  	// traceback from goexit1 must hit code range of goexit
  1674  	BYTE	$0x90	// NOP placed after the CALL so its return address still lies inside goexit
  1675  
  1676  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8	// issues a PREFETCHT0 hint for the memory at addr
  1677  	MOVQ	addr+0(FP), AX
  1678  	PREFETCHT0	(AX)
  1679  	RET
  1680  
  1681  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8	// issues a PREFETCHT1 hint for the memory at addr
  1682  	MOVQ	addr+0(FP), AX
  1683  	PREFETCHT1	(AX)
  1684  	RET
  1685  
  1686  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8	// issues a PREFETCHT2 hint for the memory at addr
  1687  	MOVQ	addr+0(FP), AX
  1688  	PREFETCHT2	(AX)
  1689  	RET
  1690  
  1691  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8	// issues a PREFETCHNTA hint for the memory at addr
  1692  	MOVQ	addr+0(FP), AX
  1693  	PREFETCHNTA	(AX)
  1694  	RET
  1695  
  1696  // This is called from .init_array and follows the platform, not Go, ABI.
  1697  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-8	// appends the module data pointed to by DI to the list ending at lastmoduledatap
  1698  	MOVQ	runtime·lastmoduledatap(SB), AX	// AX = current tail of the list
  1699  	MOVQ	DI, moduledata_next(AX)	// old tail's next = new entry
  1700  	MOVQ	DI, runtime·lastmoduledatap(SB)	// new entry becomes the tail
  1701  	RET