github.com/jonasi/go@v0.0.0-20150930005915-e78e654c1de0/src/runtime/asm_386.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
        	// Process bootstrap. In order: stash argc/argv on a realigned stack,
        	// give g0 provisional stack bounds, probe CPUID, set up TLS (through
        	// _cgo_init when present, otherwise ldt0setup), link g0 and m0, run
        	// check/args/osinit/schedinit, queue runtime·main with newproc and
        	// enter mstart. The INT $3 at the end traps if mstart ever returns.
    11  	// copy arguments forward on an even stack
    12  	MOVL	argc+0(FP), AX
    13  	MOVL	argv+4(FP), BX
    14  	SUBL	$128, SP		// plenty of scratch
    15  	ANDL	$~15, SP
    16  	MOVL	AX, 120(SP)		// save argc, argv away
    17  	MOVL	BX, 124(SP)
    18  
    19  	// set default stack bounds.
    20  	// _cgo_init may update stackguard.
        	// BP holds &g0 from here until the _cgo_init call below.
        	// The provisional low bound is 64KB below SP, plus 104 bytes.
    21  	MOVL	$runtime·g0(SB), BP
    22  	LEAL	(-64*1024+104)(SP), BX
    23  	MOVL	BX, g_stackguard0(BP)
    24  	MOVL	BX, g_stackguard1(BP)
    25  	MOVL	BX, (g_stack+stack_lo)(BP)
    26  	MOVL	SP, (g_stack+stack_hi)(BP)
    27  	
    28  	// find out information about the processor we're on
        	// CPUID leaf 0: AX returns the highest supported leaf and BX/DX/CX
        	// the vendor string. AX == 0 means leaf 1 cannot be queried.
    29  	MOVL	$0, AX
    30  	CPUID
    31  	CMPL	AX, $0
    32  	JE	nocpuinfo
    33  
    34  	// Figure out how to serialize RDTSC.
    35  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    36  	// Don't know about the rest, so let's do MFENCE.
    37  	CMPL	BX, $0x756E6547  // "Genu"
    38  	JNE	notintel
    39  	CMPL	DX, $0x49656E69  // "ineI"
    40  	JNE	notintel
    41  	CMPL	CX, $0x6C65746E  // "ntel"
    42  	JNE	notintel
    43  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
    44  notintel:
    45  
        	// CPUID leaf 1: keep the feature words in CX/DX for later checks
        	// (cputicks tests cpuid_edx).
    46  	MOVL	$1, AX
    47  	CPUID
    48  	MOVL	CX, runtime·cpuid_ecx(SB)
    49  	MOVL	DX, runtime·cpuid_edx(SB)
    50  nocpuinfo:	
    51  
    52  	// if there is an _cgo_init, call it to let it
    53  	// initialize and to set up GS.  if not,
    54  	// we set up GS ourselves.
        	// Arguments passed on the stack: 0(SP) = &g0 (BP), 4(SP) = setg_gcc.
    55  	MOVL	_cgo_init(SB), AX
    56  	TESTL	AX, AX
    57  	JZ	needtls
    58  	MOVL	$setg_gcc<>(SB), BX
    59  	MOVL	BX, 4(SP)
    60  	MOVL	BP, 0(SP)
    61  	CALL	AX
    62  
    63  	// update stackguard after _cgo_init
        	// Both guards become g0.stack.lo + _StackGuard.
    64  	MOVL	$runtime·g0(SB), CX
    65  	MOVL	(g_stack+stack_lo)(CX), AX
    66  	ADDL	$const__StackGuard, AX
    67  	MOVL	AX, g_stackguard0(CX)
    68  	MOVL	AX, g_stackguard1(CX)
    69  
    70  	// skip runtime·ldt0setup(SB) and tls test after _cgo_init for non-windows
    71  	CMPL runtime·iswindows(SB), $0
    72  	JEQ ok
    73  needtls:
    74  	// skip runtime·ldt0setup(SB) and tls test on Plan 9 in all cases
    75  	CMPL	runtime·isplan9(SB), $1
    76  	JEQ	ok
    77  
    78  	// set up %gs
    79  	CALL	runtime·ldt0setup(SB)
    80  
    81  	// store through it, to make sure it works
        	// Write 0x123 through the TLS g slot and read it back from tls0.
        	// On mismatch, the store to address 0 below faults deliberately.
    82  	get_tls(BX)
    83  	MOVL	$0x123, g(BX)
    84  	MOVL	runtime·tls0(SB), AX
    85  	CMPL	AX, $0x123
    86  	JEQ	ok
    87  	MOVL	AX, 0	// abort
    88  ok:
    89  	// set up m and g "registers"
    90  	get_tls(BX)
    91  	LEAL	runtime·g0(SB), CX
    92  	MOVL	CX, g(BX)
    93  	LEAL	runtime·m0(SB), AX
    94  
    95  	// save m->g0 = g0
    96  	MOVL	CX, m_g0(AX)
    97  	// save g0->m = m0
    98  	MOVL	AX, g_m(CX)
    99  
   100  	CALL	runtime·emptyfunc(SB)	// fault if stack check is wrong
   101  
   102  	// convention is D is always cleared
   103  	CLD
   104  
   105  	CALL	runtime·check(SB)
   106  
   107  	// saved argc, argv
        	// Reload them from 120(SP)/124(SP) into the outgoing argument slots.
   108  	MOVL	120(SP), AX
   109  	MOVL	AX, 0(SP)
   110  	MOVL	124(SP), AX
   111  	MOVL	AX, 4(SP)
   112  	CALL	runtime·args(SB)
   113  	CALL	runtime·osinit(SB)
   114  	CALL	runtime·schedinit(SB)
   115  
   116  	// create a new goroutine to start program
        	// newproc's arguments are pushed by hand: arg size 0 at the top of
        	// the stack, &mainPC above it. The two POPLs discard them afterwards.
   117  	PUSHL	$runtime·mainPC(SB)	// entry
   118  	PUSHL	$0	// arg size
   119  	CALL	runtime·newproc(SB)
   120  	POPL	AX
   121  	POPL	AX
   122  
   123  	// start this M
   124  	CALL	runtime·mstart(SB)
   125  
        	// Reached only if mstart returns; trap.
   126  	INT $3
   127  	RET
   128  
   129  DATA	runtime·mainPC+0(SB)/4,$runtime·main(SB)
        // mainPC is a 4-byte read-only word holding the address of runtime·main.
        // rt0_go pushes the address of this word as newproc's entry argument.
   130  GLOBL	runtime·mainPC(SB),RODATA,$4
   131  
   132  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
        	// Raise a breakpoint trap (INT $3), then return to the caller.
   133  	INT $3
   134  	RET
   135  
   136  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   137  	// Linux and MinGW start the FPU in extended double precision.
   138  	// Other operating systems use double precision.
   139  	// Change to double precision to match them,
   140  	// and to match other hardware that only has double.
        	// The control word is pushed so FLDCW can load it from memory, then
        	// popped to rebalance SP. In 0x27F the precision-control field
        	// (bits 8-9) is 10b, i.e. 53-bit double, and the low six bits mask
        	// all x87 exceptions.
   141  	PUSHL $0x27F
   142  	FLDCW	0(SP)
   143  	POPL AX
   144  	RET
   145  
   146  /*
   147   *  go-routine
   148   */
   149  
   150  // void gosave(Gobuf*)
   151  // save state in Gobuf; setjmp
        // Records the caller's SP and PC and the current g (read from TLS) in
        // *buf, and zeroes buf.ret and buf.ctxt.
   152  TEXT runtime·gosave(SB), NOSPLIT, $0-4
   153  	MOVL	buf+0(FP), AX		// gobuf
   154  	LEAL	buf+0(FP), BX		// caller's SP
   155  	MOVL	BX, gobuf_sp(AX)
   156  	MOVL	0(SP), BX		// caller's PC
   157  	MOVL	BX, gobuf_pc(AX)
   158  	MOVL	$0, gobuf_ret(AX)
   159  	MOVL	$0, gobuf_ctxt(AX)
   160  	get_tls(CX)
   161  	MOVL	g(CX), BX
   162  	MOVL	BX, gobuf_g(AX)
   163  	RET
   164  
   165  // void gogo(Gobuf*)
   166  // restore state from Gobuf; longjmp
        // Installs buf.g as the current g, switches SP to buf.sp, loads
        // AX = buf.ret and DX = buf.ctxt, clears those three fields in the Gobuf,
        // and jumps to buf.pc. Never returns to its caller.
   167  TEXT runtime·gogo(SB), NOSPLIT, $0-4
   168  	MOVL	buf+0(FP), BX		// gobuf
   169  	MOVL	gobuf_g(BX), DX
        	// This load exists only to fault on a nil g; CX is overwritten by
        	// get_tls on the next line.
   170  	MOVL	0(DX), CX		// make sure g != nil
   171  	get_tls(CX)
   172  	MOVL	DX, g(CX)
   173  	MOVL	gobuf_sp(BX), SP	// restore SP
   174  	MOVL	gobuf_ret(BX), AX
   175  	MOVL	gobuf_ctxt(BX), DX
   176  	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
   177  	MOVL	$0, gobuf_ret(BX)
   178  	MOVL	$0, gobuf_ctxt(BX)
   179  	MOVL	gobuf_pc(BX), BX
   180  	JMP	BX
   181  
   182  // func mcall(fn func(*g))
   183  // Switch to m->g0's stack, call fn(g).
   184  // Fn must never return.  It should gogo(&g->sched)
   185  // to keep running g.
        // The caller's PC and SP and the g itself are saved in g->sched first.
        // Calling mcall while already on g0 tail-jumps to badmcall; a return from
        // fn tail-jumps to badmcall2.
   186  TEXT runtime·mcall(SB), NOSPLIT, $0-4
   187  	MOVL	fn+0(FP), DI
   188  	
   189  	get_tls(CX)
   190  	MOVL	g(CX), AX	// save state in g->sched
   191  	MOVL	0(SP), BX	// caller's PC
   192  	MOVL	BX, (g_sched+gobuf_pc)(AX)
   193  	LEAL	fn+0(FP), BX	// caller's SP
   194  	MOVL	BX, (g_sched+gobuf_sp)(AX)
   195  	MOVL	AX, (g_sched+gobuf_g)(AX)
   196  
   197  	// switch to m->g0 & its stack, call fn
   198  	MOVL	g(CX), BX
   199  	MOVL	g_m(BX), BX
   200  	MOVL	m_g0(BX), SI
   201  	CMPL	SI, AX	// if g == m->g0 call badmcall
        	// 3(PC) skips the two-instruction badmcall tail jump below.
   202  	JNE	3(PC)
   203  	MOVL	$runtime·badmcall(SB), AX
   204  	JMP	AX
   205  	MOVL	SI, g(CX)	// g = m->g0
   206  	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
        	// AX (the g we came from) is fn's argument. DX carries the closure
        	// pointer; DI is the code pointer loaded from its first word.
   207  	PUSHL	AX
   208  	MOVL	DI, DX
   209  	MOVL	0(DI), DI
   210  	CALL	DI
   211  	POPL	AX
   212  	MOVL	$runtime·badmcall2(SB), AX
   213  	JMP	AX
   214  	RET
   215  
   216  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   217  // of the G stack.  We need to distinguish the routine that
   218  // lives at the bottom of the G stack from the one that lives
   219  // at the top of the system stack because the one at the top of
   220  // the system stack terminates the stack walk (see topofstack()).
        // systemstack stores this routine's address as g->sched.pc before it
        // switches stacks. The body is a bare RET.
   221  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   222  	RET
   223  
   224  // func systemstack(fn func())
        // Runs fn on the system (g0) stack. If the current g is already gsignal
        // or g0, fn is called directly. If it is m->curg, state is saved in
        // g->sched, execution moves to g0's stack for the call, and then moves
        // back. Any other g is reported through badsystemstack.
   225  TEXT runtime·systemstack(SB), NOSPLIT, $0-4
   226  	MOVL	fn+0(FP), DI	// DI = fn
   227  	get_tls(CX)
   228  	MOVL	g(CX), AX	// AX = g
   229  	MOVL	g_m(AX), BX	// BX = m
   230  
   231  	MOVL	m_gsignal(BX), DX	// DX = gsignal
   232  	CMPL	AX, DX
   233  	JEQ	noswitch
   234  
   235  	MOVL	m_g0(BX), DX	// DX = g0
   236  	CMPL	AX, DX
   237  	JEQ	noswitch
   238  
   239  	MOVL	m_curg(BX), BP
   240  	CMPL	AX, BP
   241  	JEQ	switch
   242  	
   243  	// Bad: g is not gsignal, not g0, not curg. What is it?
   244  	// Hide call from linker nosplit analysis.
   245  	MOVL	$runtime·badsystemstack(SB), AX
   246  	CALL	AX
   247  
   248  switch:
   249  	// save our state in g->sched.  Pretend to
   250  	// be systemstack_switch if the G stack is scanned.
   251  	MOVL	$runtime·systemstack_switch(SB), (g_sched+gobuf_pc)(AX)
   252  	MOVL	SP, (g_sched+gobuf_sp)(AX)
   253  	MOVL	AX, (g_sched+gobuf_g)(AX)
   254  
   255  	// switch to g0
        	// DX still holds g0 from the check above.
   256  	MOVL	DX, g(CX)
   257  	MOVL	(g_sched+gobuf_sp)(DX), BX
   258  	// make it look like mstart called systemstack on g0, to stop traceback
        	// A fake return-address slot holding mstart's entry address is placed
        	// just below g0's saved SP.
   259  	SUBL	$4, BX
   260  	MOVL	$runtime·mstart(SB), DX
   261  	MOVL	DX, 0(BX)
   262  	MOVL	BX, SP
   263  
   264  	// call target function
        	// DX = closure pointer, DI = code pointer from its first word.
   265  	MOVL	DI, DX
   266  	MOVL	0(DI), DI
   267  	CALL	DI
   268  
   269  	// switch back to g
        	// TLS and m are re-read because fn may have clobbered every register.
        	// The saved g->sched.sp is consumed and then cleared.
   270  	get_tls(CX)
   271  	MOVL	g(CX), AX
   272  	MOVL	g_m(AX), BX
   273  	MOVL	m_curg(BX), AX
   274  	MOVL	AX, g(CX)
   275  	MOVL	(g_sched+gobuf_sp)(AX), SP
   276  	MOVL	$0, (g_sched+gobuf_sp)(AX)
   277  	RET
   278  
   279  noswitch:
   280  	// already on system stack, just call directly
   281  	MOVL	DI, DX
   282  	MOVL	0(DI), DI
   283  	CALL	DI
   284  	RET
   285  
   286  /*
   287   * support for morestack
   288   */
   289  
   290  // Called during function prolog when more stack is needed.
   291  //
   292  // The traceback routines see morestack on a g0 as being
   293  // the top of a stack (for example, morestack calling newstack
   294  // calling the scheduler calling newm calling gc), so we must
   295  // record an argument size. For that purpose, it has no arguments.
        //
        // On entry 0(SP) is the return PC into f (the function that needs stack)
        // and 4(SP) is the return PC into f's caller. DX holds f's closure
        // context and is saved in g->sched.ctxt.
   296  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   297  	// Cannot grow scheduler stack (m->g0).
   298  	get_tls(CX)
   299  	MOVL	g(CX), BX
   300  	MOVL	g_m(BX), BX
   301  	MOVL	m_g0(BX), SI
   302  	CMPL	g(CX), SI
   303  	JNE	2(PC)
   304  	INT	$3
   305  
   306  	// Cannot grow signal stack.
   307  	MOVL	m_gsignal(BX), SI
   308  	CMPL	g(CX), SI
   309  	JNE	2(PC)
   310  	INT	$3
   311  
   312  	// Called from f.
   313  	// Set m->morebuf to f's caller.
   314  	MOVL	4(SP), DI	// f's caller's PC
   315  	MOVL	DI, (m_morebuf+gobuf_pc)(BX)
   316  	LEAL	8(SP), CX	// f's caller's SP
   317  	MOVL	CX, (m_morebuf+gobuf_sp)(BX)
        	// CX was used as scratch above, so the TLS base is reloaded.
   318  	get_tls(CX)
   319  	MOVL	g(CX), SI
   320  	MOVL	SI, (m_morebuf+gobuf_g)(BX)
   321  
   322  	// Set g->sched to context in f.
   323  	MOVL	0(SP), AX	// f's PC
   324  	MOVL	AX, (g_sched+gobuf_pc)(SI)
   325  	MOVL	SI, (g_sched+gobuf_g)(SI)
   326  	LEAL	4(SP), AX	// f's SP
   327  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   328  	MOVL	DX, (g_sched+gobuf_ctxt)(SI)
   329  
   330  	// Call newstack on m->g0's stack.
   331  	MOVL	m_g0(BX), BP
   332  	MOVL	BP, g(CX)
   333  	MOVL	(g_sched+gobuf_sp)(BP), AX
   334  	MOVL	-4(AX), BX	// fault if CALL would, before smashing SP
   335  	MOVL	AX, SP
   336  	CALL	runtime·newstack(SB)
   337  	MOVL	$0, 0x1003	// crash if newstack returns
   338  	RET
   339  
   340  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0-0
        	// Variant of morestack with no closure context: clear DX, which
        	// morestack saves as g->sched.ctxt, then tail-jump to it.
   341  	MOVL	$0, DX
   342  	JMP runtime·morestack(SB)
   343  
   344  TEXT runtime·stackBarrier(SB),NOSPLIT,$0
   345  	// We came here via a RET to an overwritten return PC.
   346  	// AX may be live. Other registers are available.
        	// Only CX, DX and BX are used below; AX is left untouched.
   347  
   348  	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   349  	get_tls(CX)
   350  	MOVL	g(CX), CX
   351  	MOVL	(g_stkbar+slice_array)(CX), DX
   352  	MOVL	g_stkbarPos(CX), BX
        	// Turn the index into a byte offset by multiplying by the element
        	// size; the scale factor in the load below is therefore 1.
   353  	IMULL	$stkbar__size, BX	// Too big for SIB.
   354  	MOVL	stkbar_savedLRVal(DX)(BX*1), BX
   355  	// Record that this stack barrier was hit.
   356  	ADDL	$1, g_stkbarPos(CX)
   357  	// Jump to the original return PC.
   358  	JMP	BX
   359  
   360  // reflectcall: call a function with the given argument list
   361  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   362  // we don't have variable-sized frames, so we use a small number
   363  // of constant-sized-frame functions to encode a few bits of size in the pc.
   364  // Caution: ugly multiline assembly macros in your future!
   365  
   366  #define DISPATCH(NAME,MAXSIZE)		\
   367  	CMPL	CX, $MAXSIZE;		\
   368  	JA	3(PC);			\
   369  	MOVL	$NAME(SB), AX;		\
   370  	JMP	AX
   371  // Note: can't just "JMP NAME(SB)" - bad inlining results.
        // DISPATCH expects the argument size in CX. If CX <= MAXSIZE (unsigned) it
        // tail-jumps to NAME through AX. Otherwise JA 3(PC) skips the MOVL/JMP pair
        // and falls through to the next DISPATCH. The 3(PC) offset relies on the
        // macro expanding to exactly four instructions.
   372  
   373  TEXT reflect·call(SB), NOSPLIT, $0-0
        	// reflect.call is an alias: tail-jump to ·reflectcall with the
        	// caller's argument frame untouched.
   374  	JMP	·reflectcall(SB)
   375  
   376  TEXT ·reflectcall(SB), NOSPLIT, $0-20
        	// Load argsize into CX and tail-jump to the first callN whose frame
        	// size N is >= argsize. The DISPATCH entries are in ascending order,
        	// so the smallest sufficient frame wins. Sizes above 1GB fall through
        	// to badreflectcall.
   377  	MOVL	argsize+12(FP), CX
   378  	DISPATCH(runtime·call16, 16)
   379  	DISPATCH(runtime·call32, 32)
   380  	DISPATCH(runtime·call64, 64)
   381  	DISPATCH(runtime·call128, 128)
   382  	DISPATCH(runtime·call256, 256)
   383  	DISPATCH(runtime·call512, 512)
   384  	DISPATCH(runtime·call1024, 1024)
   385  	DISPATCH(runtime·call2048, 2048)
   386  	DISPATCH(runtime·call4096, 4096)
   387  	DISPATCH(runtime·call8192, 8192)
   388  	DISPATCH(runtime·call16384, 16384)
   389  	DISPATCH(runtime·call32768, 32768)
   390  	DISPATCH(runtime·call65536, 65536)
   391  	DISPATCH(runtime·call131072, 131072)
   392  	DISPATCH(runtime·call262144, 262144)
   393  	DISPATCH(runtime·call524288, 524288)
   394  	DISPATCH(runtime·call1048576, 1048576)
   395  	DISPATCH(runtime·call2097152, 2097152)
   396  	DISPATCH(runtime·call4194304, 4194304)
   397  	DISPATCH(runtime·call8388608, 8388608)
   398  	DISPATCH(runtime·call16777216, 16777216)
   399  	DISPATCH(runtime·call33554432, 33554432)
   400  	DISPATCH(runtime·call67108864, 67108864)
   401  	DISPATCH(runtime·call134217728, 134217728)
   402  	DISPATCH(runtime·call268435456, 268435456)
   403  	DISPATCH(runtime·call536870912, 536870912)
   404  	DISPATCH(runtime·call1073741824, 1073741824)
   405  	MOVL	$runtime·badreflectcall(SB), AX
   406  	JMP	AX
   407  
   408  #define CALLFN(NAME,MAXSIZE)			\
   409  TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
   410  	NO_LOCAL_POINTERS;			\
   411  	/* copy argsize bytes from argptr to the bottom of this frame */	\
   412  	MOVL	argptr+8(FP), SI;		\
   413  	MOVL	argsize+12(FP), CX;		\
   414  	MOVL	SP, DI;				\
   415  	REP;MOVSB;				\
   416  	/* call function: DX = closure f, AX = code pointer from its first word */	\
   417  	MOVL	f+4(FP), DX;			\
   418  	MOVL	(DX), AX; 			\
   419  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   420  	CALL	AX;				\
   421  	/* copy return values back: bytes [retoffset, argsize) of the frame */	\
   422  	MOVL	argptr+8(FP), DI;		\
   423  	MOVL	argsize+12(FP), CX;		\
   424  	MOVL	retoffset+16(FP), BX;		\
   425  	MOVL	SP, SI;				\
   426  	ADDL	BX, DI;				\
   427  	ADDL	BX, SI;				\
   428  	SUBL	BX, CX;				\
   429  	REP;MOVSB;				\
   430  	/* execute write barrier updates: callwritebarrier(argtype, argptr, argsize, retoffset) */	\
   431  	MOVL	argtype+0(FP), DX;		\
   432  	MOVL	argptr+8(FP), DI;		\
   433  	MOVL	argsize+12(FP), CX;		\
   434  	MOVL	retoffset+16(FP), BX;		\
   435  	MOVL	DX, 0(SP);			\
   436  	MOVL	DI, 4(SP);			\
   437  	MOVL	CX, 8(SP);			\
   438  	MOVL	BX, 12(SP);			\
   439  	CALL	runtime·callwritebarrier(SB);	\
   440  	RET
        // CALLFN(NAME, MAXSIZE) defines one reflectcall trampoline with a fixed
        // MAXSIZE-byte frame and the 20-byte argument block described above
        // (argtype, f, argptr, argsize, retoffset). All frame arguments are
        // re-read from FP after each CALL rather than kept in registers.
   441  
   442  CALLFN(·call16, 16)
        // One trampoline per power-of-two frame size from 16 bytes to 1GB. The
        // list must stay in step with the DISPATCH table in ·reflectcall.
   443  CALLFN(·call32, 32)
   444  CALLFN(·call64, 64)
   445  CALLFN(·call128, 128)
   446  CALLFN(·call256, 256)
   447  CALLFN(·call512, 512)
   448  CALLFN(·call1024, 1024)
   449  CALLFN(·call2048, 2048)
   450  CALLFN(·call4096, 4096)
   451  CALLFN(·call8192, 8192)
   452  CALLFN(·call16384, 16384)
   453  CALLFN(·call32768, 32768)
   454  CALLFN(·call65536, 65536)
   455  CALLFN(·call131072, 131072)
   456  CALLFN(·call262144, 262144)
   457  CALLFN(·call524288, 524288)
   458  CALLFN(·call1048576, 1048576)
   459  CALLFN(·call2097152, 2097152)
   460  CALLFN(·call4194304, 4194304)
   461  CALLFN(·call8388608, 8388608)
   462  CALLFN(·call16777216, 16777216)
   463  CALLFN(·call33554432, 33554432)
   464  CALLFN(·call67108864, 67108864)
   465  CALLFN(·call134217728, 134217728)
   466  CALLFN(·call268435456, 268435456)
   467  CALLFN(·call536870912, 536870912)
   468  CALLFN(·call1073741824, 1073741824)
   469  
   470  // func cas(ptr *int32, old, new int32) bool
   471  // Atomic 32-bit compare-and-swap, equivalent to:
   472  //	if *ptr == old {
   473  //		*ptr = new
   474  //		return true
   475  //	}
   476  //	return false
   477  TEXT runtime·cas(SB), NOSPLIT, $0-13
   478  	MOVL	old+4(FP), AX		// CMPXCHGL compares memory against AX
   479  	MOVL	new+8(FP), DX
   480  	MOVL	ptr+0(FP), SI
   481  	LOCK
   482  	CMPXCHGL	DX, 0(SI)
   483  	SETEQ	ret+12(FP)		// ZF is set when the swap happened
   484  	RET
   485  
   486  TEXT runtime·casuintptr(SB), NOSPLIT, $0-13
        	// Same 13-byte argument frame as cas, so it is reused by tail jump.
   487  	JMP	runtime·cas(SB)
   488  
   489  TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-8
        	// Tail-jumps to runtime·atomicload (not defined in this file) with
        	// the argument frame unchanged.
   490  	JMP	runtime·atomicload(SB)
   491  
   492  TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-8
        	// Tail-jumps to runtime·atomicload (not defined in this file) with
        	// the argument frame unchanged.
   493  	JMP	runtime·atomicload(SB)
   494  
   495  TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-8
        	// Same 8-byte argument frame as atomicstore; reused by tail jump.
   496  	JMP	runtime·atomicstore(SB)
   497  
   498  // bool runtime·cas64(uint64 *val, uint64 old, uint64 new)
   499  // Atomically:
   500  //	if(*val == old){
   501  //		*val = new;
   502  //		return 1;
   503  //	} else {
   504  //		return 0;
   505  //	}
        // old is passed by value as two 32-bit halves. CMPXCHG8B compares DX:AX
        // with the 8 bytes at 0(BP) and stores CX:BX there on a match.
   506  TEXT runtime·cas64(SB), NOSPLIT, $0-21
   507  	MOVL	ptr+0(FP), BP
   508  	MOVL	old_lo+4(FP), AX
   509  	MOVL	old_hi+8(FP), DX
   510  	MOVL	new_lo+12(FP), BX
   511  	MOVL	new_hi+16(FP), CX
   512  	LOCK
   513  	CMPXCHG8B	0(BP)
   514  	SETEQ	ret+20(FP)
   515  	RET
   516  
   517  // func casp1(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool
   518  // Atomic pointer-sized compare-and-swap, equivalent to:
   519  //	if *ptr == old {
   520  //		*ptr = new
   521  //		return true
   522  //	}
   523  //	return false
   524  TEXT runtime·casp1(SB), NOSPLIT, $0-13
   525  	MOVL	old+4(FP), AX		// CMPXCHGL compares memory against AX
   526  	MOVL	new+8(FP), DX
   527  	MOVL	ptr+0(FP), SI
   528  	LOCK
   529  	CMPXCHGL	DX, 0(SI)
   530  	SETEQ	ret+12(FP)		// ZF is set when the swap happened
   531  	RET
   532  
   533  // func xadd(ptr *uint32, delta int32) uint32
   534  // Atomically adds delta to *ptr and
   535  // returns the updated value:
   536  //	*ptr += delta; return *ptr
   537  TEXT runtime·xadd(SB), NOSPLIT, $0-12
   538  	MOVL	delta+4(FP), CX		// keep delta for the final sum
   539  	MOVL	ptr+0(FP), DX
   540  	MOVL	CX, AX
   541  	LOCK
   542  	XADDL	AX, 0(DX)		// AX = previous *ptr; *ptr += delta
   543  	ADDL	CX, AX			// previous value + delta = new value
   544  	MOVL	AX, ret+8(FP)
   545  	RET
   546  
   547  TEXT runtime·xchg(SB), NOSPLIT, $0-12
        	// Atomically store new into *ptr and return the previous contents.
        	// XCHGL with a memory operand is locked implicitly.
   548  	MOVL	new+4(FP), CX
   549  	MOVL	ptr+0(FP), DX
   550  	XCHGL	CX, 0(DX)
   551  	MOVL	CX, ret+8(FP)
   552  	RET
   553  
   554  TEXT runtime·xchguintptr(SB), NOSPLIT, $0-12
        	// Same 12-byte argument frame as xchg; reused by tail jump.
   555  	JMP	runtime·xchg(SB)
   556  
   557  TEXT runtime·procyield(SB),NOSPLIT,$0-0
        	// Spin for `cycles` iterations, issuing PAUSE each time round.
   558  	MOVL	cycles+0(FP), CX
   559  spin:
   560  	PAUSE
   561  	DECL	CX
   562  	JNZ	spin
   563  	RET
   564  
   565  TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-8
        	// Store val into *ptr with XCHGL; the old value is discarded.
   566  	MOVL	val+4(FP), CX
   567  	MOVL	ptr+0(FP), DX
   568  	XCHGL	CX, 0(DX)
   569  	RET
   570  
   571  TEXT runtime·atomicstore(SB), NOSPLIT, $0-8
        	// Store val into *ptr with XCHGL; the old value is discarded.
   572  	MOVL	val+4(FP), CX
   573  	MOVL	ptr+0(FP), DX
   574  	XCHGL	CX, 0(DX)
   575  	RET
   576  
   577  // uint64 atomicload64(uint64 volatile* addr);
        // Loads 8 bytes with one MMX MOVQ and stores them to the result slot.
        // A pointer that is not 8-byte aligned is turned into a deliberate fault.
        // The MMX instructions are emitted as raw bytes and hard-code AX (source)
        // and BX (destination).
   578  TEXT runtime·atomicload64(SB), NOSPLIT, $0-12
   579  	MOVL	ptr+0(FP), AX
   580  	TESTL	$7, AX
   581  	JZ	2(PC)
   582  	MOVL	0, AX // crash with nil ptr deref
   583  	LEAL	ret_lo+4(FP), BX
   584  	// MOVQ (%EAX), %MM0
   585  	BYTE $0x0f; BYTE $0x6f; BYTE $0x00
   586  	// MOVQ %MM0, 0(%EBX)
   587  	BYTE $0x0f; BYTE $0x7f; BYTE $0x03
   588  	// EMMS
   589  	BYTE $0x0F; BYTE $0x77
   590  	RET
   591  
   592  // void runtime·atomicstore64(uint64 volatile* addr, uint64 v);
        // Stores v with one MMX MOVQ, then issues a locked no-op as a fence.
        // A pointer that is not 8-byte aligned is turned into a deliberate fault.
        // The raw-byte MOVQ reads v from 8(%ESP): with a zero-size frame, 0(SP) is
        // the return PC, 4(SP) is addr and 8(SP) is the low word of v.
   593  TEXT runtime·atomicstore64(SB), NOSPLIT, $0-12
   594  	MOVL	ptr+0(FP), AX
   595  	TESTL	$7, AX
   596  	JZ	2(PC)
   597  	MOVL	0, AX // crash with nil ptr deref
   598  	// MOVQ and EMMS were introduced on the Pentium MMX.
   599  	// MOVQ 0x8(%ESP), %MM0
   600  	BYTE $0x0f; BYTE $0x6f; BYTE $0x44; BYTE $0x24; BYTE $0x08
   601  	// MOVQ %MM0, (%EAX)
   602  	BYTE $0x0f; BYTE $0x7f; BYTE $0x00 
   603  	// EMMS
   604  	BYTE $0x0F; BYTE $0x77
   605  	// This is essentially a no-op, but it provides required memory fencing.
   606  	// It can be replaced with MFENCE, but MFENCE was introduced only on the Pentium4 (SSE2).
        	// Adding 0 to the word at (SP) leaves memory unchanged.
   607  	MOVL	$0, AX
   608  	LOCK
   609  	XADDL	AX, (SP)
   610  	RET
   611  
   612  // func atomicor8(ptr *uint8, val uint8): atomically performs *ptr |= val.
   613  TEXT runtime·atomicor8(SB), NOSPLIT, $0-5
   614  	MOVB	val+4(FP), CX
   615  	MOVL	ptr+0(FP), DX
   616  	LOCK
   617  	ORB	CX, (DX)
   618  	RET
   619  
   620  // func atomicand8(ptr *uint8, val uint8): atomically performs *ptr &= val.
   621  TEXT runtime·atomicand8(SB), NOSPLIT, $0-5
   622  	MOVB	val+4(FP), CX
   623  	MOVL	ptr+0(FP), DX
   624  	LOCK
   625  	ANDB	CX, (DX)
   626  	RET
   627  
   628  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   629  	// Stores are already ordered on x86, so this is just a
   630  	// compile barrier.
        	// No fence instruction is emitted; the body is a bare RET.
   631  	RET
   632  
   633  // void jmpdefer(fn, sp);
   634  // called from deferreturn.
   635  // 1. pop the caller
   636  // 2. sub 5 bytes from the callers return
   637  // 3. jmp to the argument
        // Rewinding the return address by 5 bytes makes the deferred function
        // return to the CALL that invoked deferreturn, so that call runs again.
        // (5 is presumably the length of that CALL's rel32 encoding; confirm
        // against the compiler's output.) DX carries the closure pointer into
        // the deferred function.
   638  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
   639  	MOVL	fv+0(FP), DX	// fn
   640  	MOVL	argp+4(FP), BX	// caller sp
   641  	LEAL	-4(BX), SP	// caller sp after CALL
   642  	SUBL	$5, (SP)	// return to CALL again
   643  	MOVL	0(DX), BX
   644  	JMP	BX	// but first run the deferred function
   645  
   646  // Save state of caller into g->sched.
        // File-local helper, called by asmcgocall. AX and BX are preserved by the
        // push/pop pair. Only sched.sp, sched.pc, sched.ret and sched.ctxt are
        // written; sched.g is not touched here.
   647  TEXT gosave<>(SB),NOSPLIT,$0
   648  	PUSHL	AX
   649  	PUSHL	BX
   650  	get_tls(BX)
   651  	MOVL	g(BX), BX
   652  	LEAL	arg+0(FP), AX
   653  	MOVL	AX, (g_sched+gobuf_sp)(BX)
        	// The word just below the first argument slot is the return address.
   654  	MOVL	-4(AX), AX
   655  	MOVL	AX, (g_sched+gobuf_pc)(BX)
   656  	MOVL	$0, (g_sched+gobuf_ret)(BX)
   657  	MOVL	$0, (g_sched+gobuf_ctxt)(BX)
   658  	POPL	BX
   659  	POPL	AX
   660  	RET
   661  
   662  // func asmcgocall(fn, arg unsafe.Pointer) int32
   663  // Call fn(arg) on the scheduler stack,
   664  // aligned appropriately for the gcc ABI.
   665  // See cgocall.go for more details.
        // Register roles up to the call: AX = fn, BX = arg, DX = entry SP,
        // SI = m->g0, DI = g on entry.
   666  TEXT ·asmcgocall(SB),NOSPLIT,$0-12
   667  	MOVL	fn+0(FP), AX
   668  	MOVL	arg+4(FP), BX
   669  
   670  	MOVL	SP, DX
   671  
   672  	// Figure out if we need to switch to m->g0 stack.
   673  	// We get called to create new OS threads too, and those
   674  	// come in on the m->g0 stack already.
   675  	get_tls(CX)
   676  	MOVL	g(CX), BP
   677  	MOVL	g_m(BP), BP
   678  	MOVL	m_g0(BP), SI
   679  	MOVL	g(CX), DI
   680  	CMPL	SI, DI
        	// 4(PC) skips the three-instruction stack switch when already on g0.
   681  	JEQ	4(PC)
   682  	CALL	gosave<>(SB)
   683  	MOVL	SI, g(CX)
   684  	MOVL	(g_sched+gobuf_sp)(SI), SP
   685  
   686  	// Now on a scheduling stack (a pthread-created stack).
        	// Scratch area layout: 0(SP) = arg, 4(SP) = stack depth, 8(SP) = g.
   687  	SUBL	$32, SP
   688  	ANDL	$~15, SP	// alignment, perhaps unnecessary
   689  	MOVL	DI, 8(SP)	// save g
   690  	MOVL	(g_stack+stack_hi)(DI), DI
   691  	SUBL	DX, DI
   692  	MOVL	DI, 4(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   693  	MOVL	BX, 0(SP)	// first argument in x86-32 ABI
   694  	CALL	AX
   695  
   696  	// Restore registers, g, stack pointer.
        	// SP is recomputed as g.stack.hi minus the saved depth.
   697  	get_tls(CX)
   698  	MOVL	8(SP), DI
   699  	MOVL	(g_stack+stack_hi)(DI), SI
   700  	SUBL	4(SP), SI
   701  	MOVL	DI, g(CX)
   702  	MOVL	SI, SP
   703  
   704  	MOVL	AX, ret+8(FP)
   705  	RET
   706  
   707  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
   708  // Turn the fn into a Go func (by taking its address) and call
   709  // cgocallback_gofunc.
        // The first outgoing argument is the address of the fn argument slot
        // (LEAL, not MOVL). frame and framesize are copied through unchanged.
        // The callee is reached by an indirect call through AX.
   710  TEXT runtime·cgocallback(SB),NOSPLIT,$12-12
   711  	LEAL	fn+0(FP), AX
   712  	MOVL	AX, 0(SP)
   713  	MOVL	frame+4(FP), AX
   714  	MOVL	AX, 4(SP)
   715  	MOVL	framesize+8(FP), AX
   716  	MOVL	AX, 8(SP)
   717  	MOVL	$runtime·cgocallback_gofunc(SB), AX
   718  	CALL	AX
   719  	RET
   720  
   721  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
   722  // See cgocall.go for more details.
        // Register roles: BP = m, DX = m on entry (0 if one was borrowed via
        // needm), SI = g0 or curg depending on the phase, CX = TLS base.
   723  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-12
   724  	NO_LOCAL_POINTERS
   725  
   726  	// If g is nil, Go did not create the current thread.
   727  	// Call needm to obtain one for temporary use.
   728  	// In this case, we're running on the thread stack, so there's
   729  	// lots of space, but the linker doesn't know. Hide the call from
   730  	// the linker analysis by using an indirect call through AX.
   731  	get_tls(CX)
   732  #ifdef GOOS_windows
        	// On Windows, if the TLS base in CX is 0, keep BP = 0 and skip the
        	// g load that follows the #endif.
   733  	MOVL	$0, BP
   734  	CMPL	CX, $0
   735  	JEQ	2(PC) // TODO
   736  #endif
   737  	MOVL	g(CX), BP
   738  	CMPL	BP, $0
   739  	JEQ	needm
   740  	MOVL	g_m(BP), BP
   741  	MOVL	BP, DX // saved copy of oldm
   742  	JMP	havem
   743  needm:
        	// 0(SP) is zeroed before the call and read back into DX afterwards,
        	// so DX == 0 marks the borrowed-m case for the dropm check at the end.
   744  	MOVL	$0, 0(SP)
   745  	MOVL	$runtime·needm(SB), AX
   746  	CALL	AX
   747  	MOVL	0(SP), DX
   748  	get_tls(CX)
   749  	MOVL	g(CX), BP
   750  	MOVL	g_m(BP), BP
   751  
   752  	// Set m->sched.sp = SP, so that if a panic happens
   753  	// during the function we are about to execute, it will
   754  	// have a valid SP to run on the g0 stack.
   755  	// The next few lines (after the havem label)
   756  	// will save this SP onto the stack and then write
   757  	// the same SP back to m->sched.sp. That seems redundant,
   758  	// but if an unrecovered panic happens, unwindm will
   759  	// restore the g->sched.sp from the stack location
   760  	// and then systemstack will try to use it. If we don't set it here,
   761  	// that restored SP will be uninitialized (typically 0) and
   762  	// will not be usable.
   763  	MOVL	m_g0(BP), SI
   764  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   765  
   766  havem:
   767  	// Now there's a valid m, and we're running on its m->g0.
   768  	// Save current m->g0->sched.sp on stack and then set it to SP.
   769  	// Save current sp in m->g0->sched.sp in preparation for
   770  	// switch back to m->curg stack.
   771  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   772  	MOVL	m_g0(BP), SI
   773  	MOVL	(g_sched+gobuf_sp)(SI), AX
   774  	MOVL	AX, 0(SP)
   775  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   776  
   777  	// Switch to m->curg stack and call runtime.cgocallbackg.
   778  	// Because we are taking over the execution of m->curg
   779  	// but *not* resuming what had been running, we need to
   780  	// save that information (m->curg->sched) so we can restore it.
   781  	// We can restore m->curg->sched.sp easily, because calling
   782  	// runtime.cgocallbackg leaves SP unchanged upon return.
   783  	// To save m->curg->sched.pc, we push it onto the stack.
   784  	// This has the added benefit that it looks to the traceback
   785  	// routine like cgocallbackg is going to return to that
   786  	// PC (because the frame we allocate below has the same
   787  	// size as cgocallback_gofunc's frame declared above)
   788  	// so that the traceback will seamlessly trace back into
   789  	// the earlier calls.
   790  	//
   791  	// In the new goroutine, 0(SP) holds the saved oldm (DX) register.
   792  	// 4(SP) and 8(SP) are unused.
        	// Resulting layout on curg's stack: the saved sched.pc sits at
        	// -4(DI), which is 12(SP) once SP = DI - (4+12).
   793  	MOVL	m_curg(BP), SI
   794  	MOVL	SI, g(CX)
   795  	MOVL	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
   796  	MOVL	(g_sched+gobuf_pc)(SI), BP
   797  	MOVL	BP, -4(DI)
   798  	LEAL	-(4+12)(DI), SP
   799  	MOVL	DX, 0(SP)
   800  	CALL	runtime·cgocallbackg(SB)
   801  	MOVL	0(SP), DX
   802  
   803  	// Restore g->sched (== m->curg->sched) from saved values.
   804  	get_tls(CX)
   805  	MOVL	g(CX), SI
   806  	MOVL	12(SP), BP
   807  	MOVL	BP, (g_sched+gobuf_pc)(SI)
   808  	LEAL	(12+4)(SP), DI
   809  	MOVL	DI, (g_sched+gobuf_sp)(SI)
   810  
   811  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   812  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   813  	// so we do not have to restore it.)
   814  	MOVL	g(CX), BP
   815  	MOVL	g_m(BP), BP
   816  	MOVL	m_g0(BP), SI
   817  	MOVL	SI, g(CX)
   818  	MOVL	(g_sched+gobuf_sp)(SI), SP
   819  	MOVL	0(SP), AX
   820  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   821  	
   822  	// If the m on entry was nil, we called needm above to borrow an m
   823  	// for the duration of the call. Since the call is over, return it with dropm.
        	// 3(PC) skips the indirect dropm call when an m already existed.
   824  	CMPL	DX, $0
   825  	JNE 3(PC)
   826  	MOVL	$runtime·dropm(SB), AX
   827  	CALL	AX
   828  
   829  	// Done!
   830  	RET
   831  
   832  // void setg(G*); set g. for use by needm.
        // On Windows the word at 0x14(FS) is maintained as well: it is cleared
        // (followed by an early return) when gg is nil, and otherwise set to
        // &gg.m.tls before the TLS g slot is written.
   833  TEXT runtime·setg(SB), NOSPLIT, $0-4
   834  	MOVL	gg+0(FP), BX
   835  #ifdef GOOS_windows
   836  	CMPL	BX, $0
   837  	JNE	settls
   838  	MOVL	$0, 0x14(FS)
   839  	RET
   840  settls:
   841  	MOVL	g_m(BX), AX
   842  	LEAL	m_tls(AX), AX
   843  	MOVL	AX, 0x14(FS)
   844  #endif
   845  	get_tls(CX)
   846  	MOVL	BX, g(CX)
   847  	RET
   848  
   849  // void setg_gcc(G*); set g. for use by gcc
        // rt0_go passes this routine's address to _cgo_init. It stores its single
        // stack argument into the TLS g slot, using only AX and DX.
   850  TEXT setg_gcc<>(SB), NOSPLIT, $0
   851  	get_tls(AX)
   852  	MOVL	gg+0(FP), DX
   853  	MOVL	DX, g(AX)
   854  	RET
   855  
   856  // check that SP is in range [g->stack.lo, g->stack.hi)
        // Traps with INT $3 unless stack.lo < SP < stack.hi (unsigned). Both
        // comparisons are strict, so SP == stack.lo traps too.
   857  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   858  	get_tls(CX)
   859  	MOVL	g(CX), AX
   860  	CMPL	(g_stack+stack_hi)(AX), SP
   861  	JHI	2(PC)
   862  	INT	$3
   863  	CMPL	SP, (g_stack+stack_lo)(AX)
   864  	JHI	2(PC)
   865  	INT	$3
   866  	RET
   867  
   868  TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8
        	// Returns the return PC stored just below argp. If that slot holds
        	// stackBarrierPC, the original PC is fetched from nextBarrierPC
        	// instead; its result comes back in this frame's 4-byte slot at 0(SP).
   869  	MOVL	argp+0(FP),AX		// addr of first arg
   870  	MOVL	-4(AX),AX		// get calling pc
   871  	CMPL	AX, runtime·stackBarrierPC(SB)
   872  	JNE	nobar
   873  	// Get original return PC.
   874  	CALL	runtime·nextBarrierPC(SB)
   875  	MOVL	0(SP), AX
   876  nobar:
   877  	MOVL	AX, ret+4(FP)
   878  	RET
   879  
   880  TEXT runtime·setcallerpc(SB),NOSPLIT,$4-8
        	// Overwrites the return PC stored just below argp with pc. If that
        	// slot holds stackBarrierPC it is left alone and pc is passed to
        	// setNextBarrierPC instead.
   881  	MOVL	argp+0(FP),AX		// addr of first arg
   882  	MOVL	pc+4(FP), BX
   883  	MOVL	-4(AX), CX
   884  	CMPL	CX, runtime·stackBarrierPC(SB)
   885  	JEQ	setbar
   886  	MOVL	BX, -4(AX)		// set calling pc
   887  	RET
   888  setbar:
   889  	// Set the stack barrier return PC.
   890  	MOVL	BX, 0(SP)
   891  	CALL	runtime·setNextBarrierPC(SB)
   892  	RET
   893  
   894  TEXT runtime·getcallersp(SB), NOSPLIT, $0-8
        	// Returns argp unchanged.
   895  	MOVL	argp+0(FP), CX
   896  	MOVL	CX, ret+4(FP)
   897  	RET
   898  
   899  // func cputicks() int64
        // Returns the RDTSC counter (DX:AX). When SSE2 is available (bit 26 of
        // cpuid_edx, mask 0x4000000) a fence is issued before RDTSC: LFENCE if
        // rt0_go set lfenceBeforeRdtsc for an Intel CPU, MFENCE otherwise.
        // Without SSE2 no fence is issued. Both fences are emitted as raw bytes.
   900  TEXT runtime·cputicks(SB),NOSPLIT,$0-8
   901  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // no sse2, no mfence
   902  	JEQ	done
   903  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   904  	JNE	mfence
   905  	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
   906  	JMP	done
   907  mfence:
   908  	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
   909  done:
   910  	RDTSC
   911  	MOVL	AX, ret_lo+0(FP)
   912  	MOVL	DX, ret_hi+4(FP)
   913  	RET
   914  
   915  TEXT runtime·ldt0setup(SB),NOSPLIT,$16-0
   916  	// set up ldt 7 to point at tls0
   917  	// ldt 1 would be fine on Linux, but on OS X, 7 is as low as we can go.
   918  	// the entry number is just a hint.  setldt will set up GS with what it used.
        	// Arguments to setldt: 0(SP) = entry hint, 4(SP) = &tls0,
        	// 8(SP) = size in bytes.
   919  	MOVL	$7, 0(SP)
   920  	LEAL	runtime·tls0(SB), AX
   921  	MOVL	AX, 4(SP)
   922  	MOVL	$32, 8(SP)	// sizeof(tls array)
   923  	CALL	runtime·setldt(SB)
   924  	RET
   925  
   926  TEXT runtime·emptyfunc(SB),0,$0-0
        	// Deliberately not NOSPLIT (flags are 0). rt0_go calls it right after
        	// installing g0 to "fault if stack check is wrong".
   927  	RET
   928  
   929  TEXT runtime·abort(SB),NOSPLIT,$0-0
        	// Crash the process with a breakpoint trap. abort must never return
        	// and must never run past its own end.
   930  	INT $0x3
        	// If the trap is resumed (for example by a debugger), spin here rather
        	// than falling through into whatever function the linker placed next.
        loop:
        	JMP	loop
   931  
   932  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   933  // redirects to memhash(p, h, size) using the size
   934  // stored in the closure.
        // DX is the closure context register; the size is read from offset 4 of
        // the closure. memhash's result comes back at 12(SP), just after its
        // three 4-byte arguments.
   935  TEXT runtime·memhash_varlen(SB),NOSPLIT,$16-12
   936  	GO_ARGS
   937  	NO_LOCAL_POINTERS
   938  	MOVL	p+0(FP), AX
   939  	MOVL	h+4(FP), BX
   940  	MOVL	4(DX), CX
   941  	MOVL	AX, 0(SP)
   942  	MOVL	BX, 4(SP)
   943  	MOVL	CX, 8(SP)
   944  	CALL	runtime·memhash(SB)
   945  	MOVL	12(SP), AX
   946  	MOVL	AX, ret+8(FP)
   947  	RET
   948  
   949  // hash function using AES hardware instructions
        // Loads aeshashbody's register arguments (AX = data, CX = length,
        // DX = address of the result slot) and tail-jumps to it. The seed is not
        // loaded here; aeshashbody reads it from h+4(FP) in the shared frame.
   950  TEXT runtime·aeshash(SB),NOSPLIT,$0-16
   951  	MOVL	p+0(FP), AX	// ptr to data
   952  	MOVL	s+8(FP), CX	// size
   953  	LEAL	ret+12(FP), DX
   954  	JMP	runtime·aeshashbody(SB)
   955  
   956  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
        	// String variant of aeshash: p points at a string header. The length
        	// at offset 4 is loaded before AX is overwritten with the data
        	// pointer at offset 0. Then tail-jumps to aeshashbody.
   957  	MOVL	p+0(FP), AX	// ptr to string object
   958  	MOVL	4(AX), CX	// length of string
   959  	MOVL	(AX), AX	// string data
   960  	LEAL	ret+8(FP), DX
   961  	JMP	runtime·aeshashbody(SB)
   962  
   963  // AX: data
   964  // CX: length
   965  // DX: address to put return value
   966  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0	// entered by JMP from aeshash/aeshashstr, so FP still addresses the caller's arguments
   967  	MOVL	h+4(FP), X6	// seed to low 64 bits of xmm6
   968  	PINSRD	$2, CX, X6	// size to high 64 bits of xmm6
   969  	PSHUFHW	$0, X6, X6	// replace size with its low 2 bytes repeated 4 times
   970  	MOVO	runtime·aeskeysched(SB), X7	// X7 = first 16 bytes of aeskeysched, used as the round key for most scrambles
   971  	CMPL	CX, $16	// dispatch on length: 0-15, 16, 17-32, 33-64, 65+
   972  	JB	aes0to15
   973  	JE	aes16
   974  	CMPL	CX, $32
   975  	JBE	aes17to32
   976  	CMPL	CX, $64
   977  	JBE	aes33to64
   978  	JMP	aes65plus
   979  	
   980  aes0to15:
   981  	TESTL	CX, CX
   982  	JE	aes0
   983  
   984  	ADDL	$16, AX	// AX = data+16; the low 16 bits are tested next
   985  	TESTW	$0xff0, AX	// bits 4-11 of data+16 all zero <=> data lies in the last 16 bytes of a 4K page
   986  	JE	endofpage
   987  
   988  	// 16 bytes loaded at this address won't cross
   989  	// a page boundary, so we can load it directly.
   990  	MOVOU	-16(AX), X0	// AX was advanced by 16 above, so this loads from the original data pointer
   991  	ADDL	CX, CX	// CX = 2*len, so CX*8 below indexes 16-byte table entries
   992  	PAND	masks<>(SB)(CX*8), X0	// keep only the low len bytes
   993  
   994  	// scramble 3 times
   995  	AESENC	X6, X0
   996  	AESENC	X7, X0
   997  	AESENC	X7, X0
   998  	MOVL	X0, (DX)	// store the low 32 bits as the hash
   999  	RET
  1000  
  1001  endofpage:
  1002  	// low 12 bits of the data address are 0xff0-0xfff.  Might be up against
  1003  	// a page boundary, so load ending at last byte.
  1004  	// Then shift bytes down using pshufb.
  1005  	MOVOU	-32(AX)(CX*1), X0	// AX = data+16 here, so this loads the 16 bytes ending at data+len
  1006  	ADDL	CX, CX	// CX = 2*len, so CX*8 below indexes 16-byte table entries
  1007  	PSHUFB	shifts<>(SB)(CX*8), X0	// move the top len bytes down to the bottom, zero the rest
  1008  	AESENC	X6, X0
  1009  	AESENC	X7, X0
  1010  	AESENC	X7, X0
  1011  	MOVL	X0, (DX)
  1012  	RET
  1013  
  1014  aes0:
  1015  	// Return scrambled input seed
  1016  	AESENC	X7, X6
  1017  	AESENC	X7, X6
  1018  	MOVL	X6, (DX)
  1019  	RET
  1020  
  1021  aes16:
  1022  	MOVOU	(AX), X0	// exactly 16 bytes: one unaligned load, no masking needed
  1023  	AESENC	X6, X0
  1024  	AESENC	X7, X0
  1025  	AESENC	X7, X0
  1026  	MOVL	X0, (DX)
  1027  	RET
  1028  
  1029  
  1030  aes17to32:
  1031  	// load data to be hashed
  1032  	MOVOU	(AX), X0	// first 16 bytes
  1033  	MOVOU	-16(AX)(CX*1), X1	// last 16 bytes (overlaps the first load when len < 32)
  1034  
  1035  	// scramble 3 times
  1036  	AESENC	X6, X0
  1037  	AESENC	runtime·aeskeysched+16(SB), X1
  1038  	AESENC	X7, X0
  1039  	AESENC	X7, X1
  1040  	AESENC	X7, X0
  1041  	AESENC	X7, X1
  1042  
  1043  	// combine results
  1044  	PXOR	X1, X0
  1045  	MOVL	X0, (DX)
  1046  	RET
  1047  
  1048  aes33to64:
  1049  	MOVOU	(AX), X0	// first 32 bytes in X0, X1
  1050  	MOVOU	16(AX), X1
  1051  	MOVOU	-32(AX)(CX*1), X2	// last 32 bytes in X2, X3 (overlaps the first 32 when len < 64)
  1052  	MOVOU	-16(AX)(CX*1), X3
  1053  	
  1054  	AESENC	X6, X0	// first round: each lane gets a different key
  1055  	AESENC	runtime·aeskeysched+16(SB), X1
  1056  	AESENC	runtime·aeskeysched+32(SB), X2
  1057  	AESENC	runtime·aeskeysched+48(SB), X3
  1058  	AESENC	X7, X0	// two more rounds per lane with X7
  1059  	AESENC	X7, X1
  1060  	AESENC	X7, X2
  1061  	AESENC	X7, X3
  1062  	AESENC	X7, X0
  1063  	AESENC	X7, X1
  1064  	AESENC	X7, X2
  1065  	AESENC	X7, X3
  1066  
  1067  	PXOR	X2, X0	// fold the four lanes into X0
  1068  	PXOR	X3, X1
  1069  	PXOR	X1, X0
  1070  	MOVL	X0, (DX)
  1071  	RET
  1072  
  1073  aes65plus:
  1074  	// start with last (possibly overlapping) block
  1075  	MOVOU	-64(AX)(CX*1), X0
  1076  	MOVOU	-48(AX)(CX*1), X1
  1077  	MOVOU	-32(AX)(CX*1), X2
  1078  	MOVOU	-16(AX)(CX*1), X3
  1079  
  1080  	// scramble state once
  1081  	AESENC	X6, X0
  1082  	AESENC	runtime·aeskeysched+16(SB), X1
  1083  	AESENC	runtime·aeskeysched+32(SB), X2
  1084  	AESENC	runtime·aeskeysched+48(SB), X3
  1085  
  1086  	// compute number of remaining 64-byte blocks
  1087  	DECL	CX	// CX = (len-1)/64, at least 1 because len >= 65 here
  1088  	SHRL	$6, CX
  1089  	
  1090  aesloop:
  1091  	// scramble state, xor in a block
  1092  	MOVOU	(AX), X4	// X4/X5 are reused as temporaries for the 64-byte block at AX
  1093  	MOVOU	16(AX), X5
  1094  	AESENC	X4, X0
  1095  	AESENC	X5, X1
  1096  	MOVOU	32(AX), X4
  1097  	MOVOU	48(AX), X5
  1098  	AESENC	X4, X2
  1099  	AESENC	X5, X3
  1100  
  1101  	// scramble state
  1102  	AESENC	X7, X0
  1103  	AESENC	X7, X1
  1104  	AESENC	X7, X2
  1105  	AESENC	X7, X3
  1106  
  1107  	ADDL	$64, AX	// advance to the next 64-byte block
  1108  	DECL	CX
  1109  	JNE	aesloop
  1110  
  1111  	// 2 more scrambles to finish
  1112  	AESENC	X7, X0
  1113  	AESENC	X7, X1
  1114  	AESENC	X7, X2
  1115  	AESENC	X7, X3
  1116  	AESENC	X7, X0
  1117  	AESENC	X7, X1
  1118  	AESENC	X7, X2
  1119  	AESENC	X7, X3
  1120  
  1121  	PXOR	X2, X0	// fold the four lanes into X0
  1122  	PXOR	X3, X1
  1123  	PXOR	X1, X0
  1124  	MOVL	X0, (DX)
  1125  	RET
  1126  
  1127  TEXT runtime·aeshash32(SB),NOSPLIT,$0-12	// hashes the 4 bytes at p with seed h; arguments p+0, h+4; result at ret+8
  1128  	MOVL	p+0(FP), AX	// ptr to data
  1129  	MOVL	h+4(FP), X0	// seed
  1130  	PINSRD	$1, (AX), X0	// data
  1131  	AESENC	runtime·aeskeysched+0(SB), X0	// three rounds, each with a different 16-byte slice of aeskeysched
  1132  	AESENC	runtime·aeskeysched+16(SB), X0
  1133  	AESENC	runtime·aeskeysched+32(SB), X0
  1134  	MOVL	X0, ret+8(FP)	// the low 32 bits are the hash
  1135  	RET
  1136  
  1137  TEXT runtime·aeshash64(SB),NOSPLIT,$0-12	// hashes the 8 bytes at p with seed h; arguments p+0, h+4; result at ret+8
  1138  	MOVL	p+0(FP), AX	// ptr to data
  1139  	MOVQ	(AX), X0	// data
  1140  	PINSRD	$2, h+4(FP), X0	// seed
  1141  	AESENC	runtime·aeskeysched+0(SB), X0	// three rounds, each with a different 16-byte slice of aeskeysched
  1142  	AESENC	runtime·aeskeysched+16(SB), X0
  1143  	AESENC	runtime·aeskeysched+32(SB), X0
  1144  	MOVL	X0, ret+8(FP)	// the low 32 bits are the hash
  1145  	RET
  1146  
  1147  // simple mask to get rid of data in the high part of the register: entry n (16 bytes, at offset n*16) keeps the low n bytes.
  1148  DATA masks<>+0x00(SB)/4, $0x00000000	// n = 0
  1149  DATA masks<>+0x04(SB)/4, $0x00000000
  1150  DATA masks<>+0x08(SB)/4, $0x00000000
  1151  DATA masks<>+0x0c(SB)/4, $0x00000000
  1152  	
  1153  DATA masks<>+0x10(SB)/4, $0x000000ff	// n = 1
  1154  DATA masks<>+0x14(SB)/4, $0x00000000
  1155  DATA masks<>+0x18(SB)/4, $0x00000000
  1156  DATA masks<>+0x1c(SB)/4, $0x00000000
  1157  	
  1158  DATA masks<>+0x20(SB)/4, $0x0000ffff	// n = 2
  1159  DATA masks<>+0x24(SB)/4, $0x00000000
  1160  DATA masks<>+0x28(SB)/4, $0x00000000
  1161  DATA masks<>+0x2c(SB)/4, $0x00000000
  1162  	
  1163  DATA masks<>+0x30(SB)/4, $0x00ffffff	// n = 3
  1164  DATA masks<>+0x34(SB)/4, $0x00000000
  1165  DATA masks<>+0x38(SB)/4, $0x00000000
  1166  DATA masks<>+0x3c(SB)/4, $0x00000000
  1167  	
  1168  DATA masks<>+0x40(SB)/4, $0xffffffff	// n = 4
  1169  DATA masks<>+0x44(SB)/4, $0x00000000
  1170  DATA masks<>+0x48(SB)/4, $0x00000000
  1171  DATA masks<>+0x4c(SB)/4, $0x00000000
  1172  	
  1173  DATA masks<>+0x50(SB)/4, $0xffffffff	// n = 5
  1174  DATA masks<>+0x54(SB)/4, $0x000000ff
  1175  DATA masks<>+0x58(SB)/4, $0x00000000
  1176  DATA masks<>+0x5c(SB)/4, $0x00000000
  1177  	
  1178  DATA masks<>+0x60(SB)/4, $0xffffffff	// n = 6
  1179  DATA masks<>+0x64(SB)/4, $0x0000ffff
  1180  DATA masks<>+0x68(SB)/4, $0x00000000
  1181  DATA masks<>+0x6c(SB)/4, $0x00000000
  1182  	
  1183  DATA masks<>+0x70(SB)/4, $0xffffffff	// n = 7
  1184  DATA masks<>+0x74(SB)/4, $0x00ffffff
  1185  DATA masks<>+0x78(SB)/4, $0x00000000
  1186  DATA masks<>+0x7c(SB)/4, $0x00000000
  1187  	
  1188  DATA masks<>+0x80(SB)/4, $0xffffffff	// n = 8
  1189  DATA masks<>+0x84(SB)/4, $0xffffffff
  1190  DATA masks<>+0x88(SB)/4, $0x00000000
  1191  DATA masks<>+0x8c(SB)/4, $0x00000000
  1192  	
  1193  DATA masks<>+0x90(SB)/4, $0xffffffff	// n = 9
  1194  DATA masks<>+0x94(SB)/4, $0xffffffff
  1195  DATA masks<>+0x98(SB)/4, $0x000000ff
  1196  DATA masks<>+0x9c(SB)/4, $0x00000000
  1197  	
  1198  DATA masks<>+0xa0(SB)/4, $0xffffffff	// n = 10
  1199  DATA masks<>+0xa4(SB)/4, $0xffffffff
  1200  DATA masks<>+0xa8(SB)/4, $0x0000ffff
  1201  DATA masks<>+0xac(SB)/4, $0x00000000
  1202  	
  1203  DATA masks<>+0xb0(SB)/4, $0xffffffff	// n = 11
  1204  DATA masks<>+0xb4(SB)/4, $0xffffffff
  1205  DATA masks<>+0xb8(SB)/4, $0x00ffffff
  1206  DATA masks<>+0xbc(SB)/4, $0x00000000
  1207  	
  1208  DATA masks<>+0xc0(SB)/4, $0xffffffff	// n = 12
  1209  DATA masks<>+0xc4(SB)/4, $0xffffffff
  1210  DATA masks<>+0xc8(SB)/4, $0xffffffff
  1211  DATA masks<>+0xcc(SB)/4, $0x00000000
  1212  	
  1213  DATA masks<>+0xd0(SB)/4, $0xffffffff	// n = 13
  1214  DATA masks<>+0xd4(SB)/4, $0xffffffff
  1215  DATA masks<>+0xd8(SB)/4, $0xffffffff
  1216  DATA masks<>+0xdc(SB)/4, $0x000000ff
  1217  	
  1218  DATA masks<>+0xe0(SB)/4, $0xffffffff	// n = 14
  1219  DATA masks<>+0xe4(SB)/4, $0xffffffff
  1220  DATA masks<>+0xe8(SB)/4, $0xffffffff
  1221  DATA masks<>+0xec(SB)/4, $0x0000ffff
  1222  	
  1223  DATA masks<>+0xf0(SB)/4, $0xffffffff	// n = 15
  1224  DATA masks<>+0xf4(SB)/4, $0xffffffff
  1225  DATA masks<>+0xf8(SB)/4, $0xffffffff
  1226  DATA masks<>+0xfc(SB)/4, $0x00ffffff
  1227  
  1228  GLOBL masks<>(SB),RODATA,$256	// 16 entries of 16 bytes, read-only
  1229  
  1230  // these are arguments to pshufb.  Entry n (16 bytes, at offset n*16) moves data down from
  1231  // the top n bytes of the register to the low n bytes of the register; 0xff selectors zero the remaining bytes.
  1232  // index is how many bytes to move.  Entry 0 is a placeholder: aes0to15 sends length 0 to aes0 before any lookup.
  1233  DATA shifts<>+0x00(SB)/4, $0x00000000	// n = 0
  1234  DATA shifts<>+0x04(SB)/4, $0x00000000
  1235  DATA shifts<>+0x08(SB)/4, $0x00000000
  1236  DATA shifts<>+0x0c(SB)/4, $0x00000000
  1237  	
  1238  DATA shifts<>+0x10(SB)/4, $0xffffff0f	// n = 1: byte 15 -> byte 0
  1239  DATA shifts<>+0x14(SB)/4, $0xffffffff
  1240  DATA shifts<>+0x18(SB)/4, $0xffffffff
  1241  DATA shifts<>+0x1c(SB)/4, $0xffffffff
  1242  	
  1243  DATA shifts<>+0x20(SB)/4, $0xffff0f0e	// n = 2: bytes 14-15 -> bytes 0-1
  1244  DATA shifts<>+0x24(SB)/4, $0xffffffff
  1245  DATA shifts<>+0x28(SB)/4, $0xffffffff
  1246  DATA shifts<>+0x2c(SB)/4, $0xffffffff
  1247  	
  1248  DATA shifts<>+0x30(SB)/4, $0xff0f0e0d	// n = 3
  1249  DATA shifts<>+0x34(SB)/4, $0xffffffff
  1250  DATA shifts<>+0x38(SB)/4, $0xffffffff
  1251  DATA shifts<>+0x3c(SB)/4, $0xffffffff
  1252  	
  1253  DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c	// n = 4
  1254  DATA shifts<>+0x44(SB)/4, $0xffffffff
  1255  DATA shifts<>+0x48(SB)/4, $0xffffffff
  1256  DATA shifts<>+0x4c(SB)/4, $0xffffffff
  1257  	
  1258  DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b	// n = 5
  1259  DATA shifts<>+0x54(SB)/4, $0xffffff0f
  1260  DATA shifts<>+0x58(SB)/4, $0xffffffff
  1261  DATA shifts<>+0x5c(SB)/4, $0xffffffff
  1262  	
  1263  DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a	// n = 6
  1264  DATA shifts<>+0x64(SB)/4, $0xffff0f0e
  1265  DATA shifts<>+0x68(SB)/4, $0xffffffff
  1266  DATA shifts<>+0x6c(SB)/4, $0xffffffff
  1267  	
  1268  DATA shifts<>+0x70(SB)/4, $0x0c0b0a09	// n = 7
  1269  DATA shifts<>+0x74(SB)/4, $0xff0f0e0d
  1270  DATA shifts<>+0x78(SB)/4, $0xffffffff
  1271  DATA shifts<>+0x7c(SB)/4, $0xffffffff
  1272  	
  1273  DATA shifts<>+0x80(SB)/4, $0x0b0a0908	// n = 8
  1274  DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c
  1275  DATA shifts<>+0x88(SB)/4, $0xffffffff
  1276  DATA shifts<>+0x8c(SB)/4, $0xffffffff
  1277  	
  1278  DATA shifts<>+0x90(SB)/4, $0x0a090807	// n = 9
  1279  DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b
  1280  DATA shifts<>+0x98(SB)/4, $0xffffff0f
  1281  DATA shifts<>+0x9c(SB)/4, $0xffffffff
  1282  	
  1283  DATA shifts<>+0xa0(SB)/4, $0x09080706	// n = 10
  1284  DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a
  1285  DATA shifts<>+0xa8(SB)/4, $0xffff0f0e
  1286  DATA shifts<>+0xac(SB)/4, $0xffffffff
  1287  	
  1288  DATA shifts<>+0xb0(SB)/4, $0x08070605	// n = 11
  1289  DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09
  1290  DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d
  1291  DATA shifts<>+0xbc(SB)/4, $0xffffffff
  1292  	
  1293  DATA shifts<>+0xc0(SB)/4, $0x07060504	// n = 12
  1294  DATA shifts<>+0xc4(SB)/4, $0x0b0a0908
  1295  DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c
  1296  DATA shifts<>+0xcc(SB)/4, $0xffffffff
  1297  	
  1298  DATA shifts<>+0xd0(SB)/4, $0x06050403	// n = 13
  1299  DATA shifts<>+0xd4(SB)/4, $0x0a090807
  1300  DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b
  1301  DATA shifts<>+0xdc(SB)/4, $0xffffff0f
  1302  	
  1303  DATA shifts<>+0xe0(SB)/4, $0x05040302	// n = 14
  1304  DATA shifts<>+0xe4(SB)/4, $0x09080706
  1305  DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a
  1306  DATA shifts<>+0xec(SB)/4, $0xffff0f0e
  1307  	
  1308  DATA shifts<>+0xf0(SB)/4, $0x04030201	// n = 15
  1309  DATA shifts<>+0xf4(SB)/4, $0x08070605
  1310  DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09
  1311  DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d
  1312  
  1313  GLOBL shifts<>(SB),RODATA,$256	// 16 entries of 16 bytes, read-only
  1314  
  1315  TEXT runtime·memeq(SB),NOSPLIT,$0-13	// compares size bytes at a and b; 1-byte result at ret+12
  1316  	LEAL	ret+12(FP), AX	// AX = address of the result byte
  1317  	MOVL	size+8(FP), BX	// BX = byte count
  1318  	MOVL	b+4(FP), DI	// DI = b
  1319  	MOVL	a+0(FP), SI	// SI = a
  1320  	JMP	runtime·memeqbody(SB)	// tail jump; memeqbody writes the result and returns to our caller
  1321  
  1322  // memequal_varlen(a, b unsafe.Pointer) bool
  1323  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9	// like memeq, but the size comes from the closure in DX
  1324  	MOVL    a+0(FP), SI
  1325  	MOVL    b+4(FP), DI
  1326  	CMPL    SI, DI	// identical pointers: equal without reading memory
  1327  	JEQ     eq
  1328  	MOVL    4(DX), BX    // compiler stores size at offset 4 in the closure
  1329  	LEAL	ret+8(FP), AX	// AX = address of the result byte for memeqbody
  1330  	JMP	runtime·memeqbody(SB)
  1331  eq:
  1332  	MOVB    $1, ret+8(FP)
  1333  	RET
  1334  
  1335  // eqstring tests whether two strings are equal.
  1336  // The compiler guarantees that strings passed
  1337  // to eqstring have equal length.
  1338  // See runtime_test.go:eqstring_generic for
  1339  // equivalent Go code.
  1340  TEXT runtime·eqstring(SB),NOSPLIT,$0-17	// two 8-byte string headers (pointer, length); 1-byte result at v+16
  1341  	MOVL	s1str+0(FP), SI
  1342  	MOVL	s2str+8(FP), DI
  1343  	CMPL	SI, DI	// same data pointer (and, per the guarantee above, same length): equal
  1344  	JEQ	same
  1345  	MOVL	s1len+4(FP), BX	// only s1's length is read; s2's is guaranteed to match
  1346  	LEAL	v+16(FP), AX	// AX = address of the result byte for memeqbody
  1347  	JMP	runtime·memeqbody(SB)
  1348  same:
  1349  	MOVB	$1, v+16(FP)
  1350  	RET
  1351  
  1352  TEXT bytes·Equal(SB),NOSPLIT,$0-25	// two 12-byte slice headers a, b; 1-byte result at ret+24
  1353  	MOVL	a_len+4(FP), BX
  1354  	MOVL	b_len+16(FP), CX
  1355  	CMPL	BX, CX	// different lengths can never be equal
  1356  	JNE	eqret
  1357  	MOVL	a+0(FP), SI
  1358  	MOVL	b+12(FP), DI
  1359  	LEAL	ret+24(FP), AX	// AX = address of the result byte for memeqbody
  1360  	JMP	runtime·memeqbody(SB)	// BX still holds the common length
  1361  eqret:
  1362  	MOVB	$0, ret+24(FP)
  1363  	RET
  1364  
  1365  // a in SI
  1366  // b in DI
  1367  // count in BX
  1368  // address of result byte in AX
  1369  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0	// stores 1 at (AX) if the BX bytes at SI and DI are equal, else 0; clobbers BX, CX, DX, SI, DI, X0-X7
  1370  	CMPL	BX, $4
  1371  	JB	small	// fewer than 4 bytes: handled with a single guarded 4-byte load per side
  1372  
  1373  	// 64 bytes at a time using xmm registers
  1374  hugeloop:
  1375  	CMPL	BX, $64
  1376  	JB	bigloop
  1377  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
  1378  	JE	bigloop
  1379  	MOVOU	(SI), X0
  1380  	MOVOU	(DI), X1
  1381  	MOVOU	16(SI), X2
  1382  	MOVOU	16(DI), X3
  1383  	MOVOU	32(SI), X4
  1384  	MOVOU	32(DI), X5
  1385  	MOVOU	48(SI), X6
  1386  	MOVOU	48(DI), X7
  1387  	PCMPEQB	X1, X0	// per-byte compare: 0xff where equal, 0x00 where different
  1388  	PCMPEQB	X3, X2
  1389  	PCMPEQB	X5, X4
  1390  	PCMPEQB	X7, X6
  1391  	PAND	X2, X0	// AND the four results together into X0
  1392  	PAND	X6, X4
  1393  	PAND	X4, X0
  1394  	PMOVMSKB X0, DX	// DX = one bit per byte lane; 0xffff means all 64 bytes matched
  1395  	ADDL	$64, SI
  1396  	ADDL	$64, DI
  1397  	SUBL	$64, BX
  1398  	CMPL	DX, $0xffff
  1399  	JEQ	hugeloop
  1400  	MOVB	$0, (AX)
  1401  	RET
  1402  
  1403  	// 4 bytes at a time using 32-bit register
  1404  bigloop:
  1405  	CMPL	BX, $4
  1406  	JBE	leftover
  1407  	MOVL	(SI), CX
  1408  	MOVL	(DI), DX
  1409  	ADDL	$4, SI
  1410  	ADDL	$4, DI
  1411  	SUBL	$4, BX
  1412  	CMPL	CX, DX
  1413  	JEQ	bigloop
  1414  	MOVB	$0, (AX)
  1415  	RET
  1416  
  1417  	// remaining 0-4 bytes
  1418  leftover:
  1419  	MOVL	-4(SI)(BX*1), CX	// load the 4 bytes ending at the end of the data; may overlap bytes already found equal
  1420  	MOVL	-4(DI)(BX*1), DX
  1421  	CMPL	CX, DX
  1422  	SETEQ	(AX)
  1423  	RET
  1424  
  1425  small:
  1426  	CMPL	BX, $0
  1427  	JEQ	equal	// zero bytes compare equal; ZF is set here, so SETEQ at equal stores 1
  1428  
  1429  	LEAL	0(BX*8), CX
  1430  	NEGL	CX	// CX = -8*count; as a shift count (taken mod 32) this is 32-8*count
  1431  
  1432  	MOVL	SI, DX
  1433  	CMPB	DX, $0xfc	// low address byte above 0xfc: a 4-byte load at SI would run past a 256-byte boundary
  1434  	JA	si_high
  1435  
  1436  	// load at SI won't cross a page boundary.
  1437  	MOVL	(SI), SI	// wanted bytes are in the low end; the high bytes are garbage, shifted out below
  1438  	JMP	si_finish
  1439  si_high:
  1440  	// address ends in 111111xx.  Load up to bytes we want, move to correct position.
  1441  	MOVL	-4(SI)(BX*1), SI
  1442  	SHRL	CX, SI
  1443  si_finish:
  1444  
  1445  	// same for DI.
  1446  	MOVL	DI, DX
  1447  	CMPB	DX, $0xfc
  1448  	JA	di_high
  1449  	MOVL	(DI), DI
  1450  	JMP	di_finish
  1451  di_high:
  1452  	MOVL	-4(DI)(BX*1), DI
  1453  	SHRL	CX, DI
  1454  di_finish:
  1455  
  1456  	SUBL	SI, DI	// difference of the two words
  1457  	SHLL	CX, DI	// shift out the bytes beyond count; ZF is set iff the low count bytes were equal
  1458  equal:
  1459  	SETEQ	(AX)
  1460  	RET
  1461  
  1462  TEXT runtime·cmpstring(SB),NOSPLIT,$0-20	// three-way string compare; 4-byte result at ret+16
  1463  	LEAL	ret+16(FP), AX	// AX = address of the result word
  1464  	MOVL	s2_len+12(FP), DX	// DX = blen
  1465  	MOVL	s2_base+8(FP), DI	// DI = b
  1466  	MOVL	s1_len+4(FP), BX	// BX = alen
  1467  	MOVL	s1_base+0(FP), SI	// SI = a
  1468  	JMP	runtime·cmpbody(SB)	// tail jump; cmpbody writes the result and returns to our caller
  1469  
  1470  TEXT bytes·Compare(SB),NOSPLIT,$0-28	// three-way compare of two 12-byte slice headers; 4-byte result at ret+24
  1471  	LEAL	ret+24(FP), AX	// AX = address of the result word
  1472  	MOVL	s2+16(FP), DX	// DX = blen (second word of the s2 header)
  1473  	MOVL	s2+12(FP), DI	// DI = b
  1474  	MOVL	s1+4(FP), BX	// BX = alen (second word of the s1 header)
  1475  	MOVL	s1+0(FP), SI	// SI = a
  1476  	JMP	runtime·cmpbody(SB)	// tail jump; cmpbody writes the result and returns to our caller
  1477  
  1478  TEXT bytes·IndexByte(SB),NOSPLIT,$0-20	// index of the first byte equal to c in slice s, or -1; result at ret+16
  1479  	MOVL	s+0(FP), SI
  1480  	MOVL	s_len+4(FP), CX
  1481  	MOVB	c+12(FP), AL
  1482  	MOVL	SI, DI	// SI keeps the start; DI is advanced by the scan
  1483  	CLD; REPN; SCASB	// scan forward up to CX bytes for AL; on a match DI is one past it and ZF is set
  1484  	JZ 3(PC)	// found: skip the two not-found instructions. With s_len == 0 the scan does not run and flags are stale, but DI == SI makes both paths give -1
  1485  	MOVL	$-1, ret+16(FP)
  1486  	RET
  1487  	SUBL	SI, DI	// DI = bytes consumed, including the matching one
  1488  	SUBL	$1, DI	// index of the match
  1489  	MOVL	DI, ret+16(FP)
  1490  	RET
  1491  
  1492  TEXT strings·IndexByte(SB),NOSPLIT,$0-16	// index of the first byte equal to c in string s, or -1; result at ret+12
  1493  	MOVL	s+0(FP), SI
  1494  	MOVL	s_len+4(FP), CX
  1495  	MOVB	c+8(FP), AL
  1496  	MOVL	SI, DI	// SI keeps the start; DI is advanced by the scan
  1497  	CLD; REPN; SCASB	// scan forward up to CX bytes for AL; on a match DI is one past it and ZF is set
  1498  	JZ 3(PC)	// found: skip the two not-found instructions. With s_len == 0 the scan does not run and flags are stale, but DI == SI makes both paths give -1
  1499  	MOVL	$-1, ret+12(FP)
  1500  	RET
  1501  	SUBL	SI, DI	// DI = bytes consumed, including the matching one
  1502  	SUBL	$1, DI	// index of the match
  1503  	MOVL	DI, ret+12(FP)
  1504  	RET
  1505  
  1506  // input:
  1507  //   SI = a
  1508  //   DI = b
  1509  //   BX = alen
  1510  //   DX = blen
  1511  //   AX = address of return word (set to 1/0/-1)
  1512  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0	// lexicographic three-way compare; clobbers BX, CX, DX, BP, SI, DI, X0, X1
  1513  	MOVL	DX, BP
  1514  	SUBL	BX, DX // DX = blen-alen
  1515  	JLE	2(PC)	// branch instead of CMOVLGT: CMOV is a P6-family instruction and nothing here checks for it
        	MOVL	BX, BP // BP = min(alen, blen)
  1516  	CMPL	SI, DI
  1517  	JEQ	allsame	// same start address: the common prefix is trivially equal
  1518  	CMPL	BP, $4
  1519  	JB	small
  1520  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
  1521  	JE	mediumloop
  1522  largeloop:
  1523  	CMPL	BP, $16
  1524  	JB	mediumloop
  1525  	MOVOU	(SI), X0
  1526  	MOVOU	(DI), X1
  1527  	PCMPEQB X0, X1	// per-byte compare: 0xff where equal
  1528  	PMOVMSKB X1, BX	// one bit per byte lane
  1529  	XORL	$0xffff, BX	// convert EQ to NE
  1530  	JNE	diff16	// branch if at least one byte is not equal
  1531  	ADDL	$16, SI
  1532  	ADDL	$16, DI
  1533  	SUBL	$16, BP
  1534  	JMP	largeloop
  1535  
  1536  diff16:
  1537  	BSFL	BX, BX	// index of first byte that differs
  1538  	XORL	DX, DX
  1539  	MOVB	(SI)(BX*1), CX
  1540  	CMPB	CX, (DI)(BX*1)
  1541  	SETHI	DX	// unsigned: 1 if a's byte is larger
  1542  	LEAL	-1(DX*2), DX	// convert 1/0 to +1/-1
  1543  	MOVL	DX, (AX)
  1544  	RET
  1545  
  1546  mediumloop:
  1547  	CMPL	BP, $4
  1548  	JBE	_0through4
  1549  	MOVL	(SI), BX
  1550  	MOVL	(DI), CX
  1551  	CMPL	BX, CX
  1552  	JNE	diff4
  1553  	ADDL	$4, SI
  1554  	ADDL	$4, DI
  1555  	SUBL	$4, BP
  1556  	JMP	mediumloop
  1557  
  1558  _0through4:
  1559  	MOVL	-4(SI)(BP*1), BX	// last 4 bytes of the common prefix; may overlap bytes already found equal
  1560  	MOVL	-4(DI)(BP*1), CX
  1561  	CMPL	BX, CX
  1562  	JEQ	allsame
  1563  
  1564  diff4:
  1565  	BSWAPL	BX	// reverse order of bytes
  1566  	BSWAPL	CX
  1567  	XORL	BX, CX	// find bit differences
  1568  	BSRL	CX, CX	// index of highest bit difference
  1569  	SHRL	CX, BX	// move a's bit to bottom
  1570  	ANDL	$1, BX	// mask bit
  1571  	LEAL	-1(BX*2), BX // 1/0 => +1/-1
  1572  	MOVL	BX, (AX)
  1573  	RET
  1574  
  1575  	// 0-3 bytes in common
  1576  small:
  1577  	LEAL	(BP*8), CX
  1578  	NEGL	CX	// CX = -8*BP; as a shift count (taken mod 32) this is 32-8*BP. ZF is set when BP == 0
  1579  	JEQ	allsame
  1580  
  1581  	// load si
  1582  	CMPB	SI, $0xfc	// low address byte above 0xfc: a 4-byte load at SI would run past a 256-byte boundary
  1583  	JA	si_high
  1584  	MOVL	(SI), SI
  1585  	JMP	si_finish
  1586  si_high:
  1587  	MOVL	-4(SI)(BP*1), SI	// load the 4 bytes ending at the last wanted byte, then shift them down
  1588  	SHRL	CX, SI
  1589  si_finish:
  1590  	SHLL	CX, SI	// wanted bytes now occupy the top of the register, zeros below
  1591  
  1592  	// same for di
  1593  	CMPB	DI, $0xfc
  1594  	JA	di_high
  1595  	MOVL	(DI), DI
  1596  	JMP	di_finish
  1597  di_high:
  1598  	MOVL	-4(DI)(BP*1), DI
  1599  	SHRL	CX, DI
  1600  di_finish:
  1601  	SHLL	CX, DI
  1602  
  1603  	BSWAPL	SI	// reverse order of bytes
  1604  	BSWAPL	DI
  1605  	XORL	SI, DI	// find bit differences
  1606  	JEQ	allsame
  1607  	BSRL	DI, CX	// index of highest bit difference
  1608  	SHRL	CX, SI	// move a's bit to bottom
  1609  	ANDL	$1, SI	// mask bit
  1610  	LEAL	-1(SI*2), BX // 1/0 => +1/-1
  1611  	MOVL	BX, (AX)
  1612  	RET
  1613  
  1614  	// all the bytes in common are the same, so we just need
  1615  	// to compare the lengths.
  1616  allsame:
  1617  	XORL	BX, BX
  1618  	XORL	CX, CX
  1619  	TESTL	DX, DX	// DX still holds blen-alen from the top of the function
  1620  	SETLT	BX	// 1 if alen > blen
  1621  	SETEQ	CX	// 1 if alen == blen
  1622  	LEAL	-1(CX)(BX*2), BX	// 1,0,-1 result
  1623  	MOVL	BX, (AX)
  1624  	RET
  1625  
  1626  TEXT runtime·fastrand1(SB), NOSPLIT, $0-4	// advances the per-m state m_fastrand one step and returns the new value at ret+0
  1627  	get_tls(CX)
  1628  	MOVL	g(CX), AX
  1629  	MOVL	g_m(AX), AX
  1630  	MOVL	m_fastrand(AX), DX
  1631  	ADDL	DX, DX	// DX = 2*x
  1632  	MOVL	DX, BX	// BX keeps the un-xored 2*x
  1633  	XORL	$0x88888eef, DX	// the constant has its top bit set, so the sign flag is now the inverse of 2*x's top bit
  1634  	JPL	2(PC)	// result non-negative <=> 2*x had its top bit set: keep the xored value
        	MOVL	BX, DX	// otherwise fall back to plain 2*x (a branch replaces CMOVLMI, a P6-family instruction used with no CPU check)
  1635  	MOVL	DX, m_fastrand(AX)
  1636  	MOVL	DX, ret+0(FP)
  1637  	RET
  1638  
  1639  TEXT runtime·return0(SB), NOSPLIT, $0	// sets AX to 0 and returns; the result is left in the register, not stored to the frame
  1640  	MOVL	$0, AX
  1641  	RET
  1642  
  1643  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  1644  // Must obey the gcc calling convention.
  1645  TEXT _cgo_topofstack(SB),NOSPLIT,$0	// result is returned in AX; only AX and CX are written
  1646  	get_tls(CX)
  1647  	MOVL	g(CX), AX
  1648  	MOVL	g_m(AX), AX
  1649  	MOVL	m_curg(AX), AX
  1650  	MOVL	(g_stack+stack_hi)(AX), AX
  1651  	RET
  1652  
  1653  // The top-most function running on a goroutine
  1654  // returns to goexit+PCQuantum.
  1655  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  1656  	BYTE	$0x90	// NOP
  1657  	CALL	runtime·goexit1(SB)	// does not return
  1658  	// traceback from goexit1 must hit code range of goexit
  1659  	BYTE	$0x90	// NOP
  1660  
  1661  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4	// issues a PREFETCHT0 hint for the address passed in addr; no result
  1662  	MOVL	addr+0(FP), AX
  1663  	PREFETCHT0	(AX)
  1664  	RET
  1665  
  1666  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4	// issues a PREFETCHT1 hint for the address passed in addr; no result
  1667  	MOVL	addr+0(FP), AX
  1668  	PREFETCHT1	(AX)
  1669  	RET
  1670  
  1671  
  1672  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4	// issues a PREFETCHT2 hint for the address passed in addr; no result
  1673  	MOVL	addr+0(FP), AX
  1674  	PREFETCHT2	(AX)
  1675  	RET
  1676  
  1677  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4	// issues a PREFETCHNTA hint for the address passed in addr; no result
  1678  	MOVL	addr+0(FP), AX
  1679  	PREFETCHNTA	(AX)
  1680  	RET