github.com/alash3al/go@v0.0.0-20150827002835-d497eeb00540/src/runtime/asm_386.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
        	// Start-up sequence: save argc/argv, give g0 provisional stack
        	// bounds, probe CPUID, set up TLS (through _cgo_init when it is
        	// non-nil, otherwise through ldt0setup), link g0 and m0 together,
        	// run check/args/osinit/schedinit, queue runtime·main via newproc
        	// and enter mstart.
    11  	// copy arguments forward on an even stack
    12  	MOVL	argc+0(FP), AX
    13  	MOVL	argv+4(FP), BX
    14  	SUBL	$128, SP		// plenty of scratch
    15  	ANDL	$~15, SP		// round SP down to a multiple of 16
    16  	MOVL	AX, 120(SP)		// save argc, argv away
    17  	MOVL	BX, 124(SP)
    18  
    19  	// set default stack bounds.
    20  	// _cgo_init may update stackguard.
    21  	MOVL	$runtime·g0(SB), BP
    22  	LEAL	(-64*1024+104)(SP), BX	// provisional limit: 64 KB below SP, plus 104 bytes
    23  	MOVL	BX, g_stackguard0(BP)
    24  	MOVL	BX, g_stackguard1(BP)
    25  	MOVL	BX, (g_stack+stack_lo)(BP)
    26  	MOVL	SP, (g_stack+stack_hi)(BP)
    27  	
    28  	// find out information about the processor we're on
    29  	MOVL	$0, AX			// CPUID leaf 0: max leaf in AX, vendor string in BX, DX, CX
    30  	CPUID
    31  	CMPL	AX, $0
    32  	JE	nocpuinfo		// no leaf 1 available: leave cpuid_ecx/cpuid_edx untouched
    33  
    34  	// Figure out how to serialize RDTSC.
    35  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    36  	// Don't know about the rest, so let's do MFENCE.
    37  	CMPL	BX, $0x756E6547  // "Genu"
    38  	JNE	notintel
    39  	CMPL	DX, $0x49656E69  // "ineI"
    40  	JNE	notintel
    41  	CMPL	CX, $0x6C65746E  // "ntel"
    42  	JNE	notintel
    43  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)	// consulted by cputicks
    44  notintel:
    45  
    46  	MOVL	$1, AX			// CPUID leaf 1: feature bits in CX and DX
    47  	CPUID
    48  	MOVL	CX, runtime·cpuid_ecx(SB)
    49  	MOVL	DX, runtime·cpuid_edx(SB)
    50  nocpuinfo:	
    51  
    52  	// if there is a _cgo_init, call it to let it
    53  	// initialize and to set up GS.  if not,
    54  	// we set up GS ourselves.
    55  	MOVL	_cgo_init(SB), AX
    56  	TESTL	AX, AX
    57  	JZ	needtls
    58  	MOVL	$setg_gcc<>(SB), BX
    59  	MOVL	BX, 4(SP)		// second argument: address of setg_gcc
    60  	MOVL	BP, 0(SP)		// first argument: BP still holds &g0 from above
    61  	CALL	AX
    62  
    63  	// update stackguard after _cgo_init
    64  	MOVL	$runtime·g0(SB), CX
    65  	MOVL	(g_stack+stack_lo)(CX), AX
    66  	ADDL	$const__StackGuard, AX
    67  	MOVL	AX, g_stackguard0(CX)
    68  	MOVL	AX, g_stackguard1(CX)
    69  
    70  	// skip runtime·ldt0setup(SB) and tls test after _cgo_init for non-windows
    71  	CMPL runtime·iswindows(SB), $0
    72  	JEQ ok
    73  needtls:
    74  	// skip runtime·ldt0setup(SB) and tls test on Plan 9 in all cases
    75  	CMPL	runtime·isplan9(SB), $1
    76  	JEQ	ok
    77  
    78  	// set up %gs
    79  	CALL	runtime·ldt0setup(SB)
    80  
    81  	// store through it, to make sure it works
    82  	get_tls(BX)
    83  	MOVL	$0x123, g(BX)		// write a marker through the TLS slot...
    84  	MOVL	runtime·tls0(SB), AX	// ...and read it back directly from tls0
    85  	CMPL	AX, $0x123
    86  	JEQ	ok
    87  	MOVL	AX, 0	// abort (store to address 0)
    88  ok:
    89  	// set up m and g "registers"
    90  	get_tls(BX)
    91  	LEAL	runtime·g0(SB), CX
    92  	MOVL	CX, g(BX)
    93  	LEAL	runtime·m0(SB), AX
    94  
    95  	// save m->g0 = g0
    96  	MOVL	CX, m_g0(AX)
    97  	// save g0->m = m0
    98  	MOVL	AX, g_m(CX)
    99  
   100  	CALL	runtime·emptyfunc(SB)	// fault if stack check is wrong
   101  
   102  	// convention is D is always cleared
   103  	CLD
   104  
   105  	CALL	runtime·check(SB)
   106  
   107  	// saved argc, argv
   108  	MOVL	120(SP), AX
   109  	MOVL	AX, 0(SP)
   110  	MOVL	124(SP), AX
   111  	MOVL	AX, 4(SP)
   112  	CALL	runtime·args(SB)
   113  	CALL	runtime·osinit(SB)
   114  	CALL	runtime·schedinit(SB)
   115  
   116  	// create a new goroutine to start program
   117  	PUSHL	$runtime·mainPC(SB)	// entry
   118  	PUSHL	$0	// arg size
   119  	CALL	runtime·newproc(SB)
   120  	POPL	AX			// discard the two words pushed for newproc
   121  	POPL	AX
   122  
   123  	// start this M
   124  	CALL	runtime·mstart(SB)
   125  
   126  	INT $3				// trap if mstart ever returns
   127  	RET
   128  
   129  DATA	runtime·mainPC+0(SB)/4,$runtime·main(SB)	// mainPC is one 4-byte word holding the address of runtime·main
   130  GLOBL	runtime·mainPC(SB),RODATA,$4			// read-only; rt0_go passes its address to newproc as the entry
   131  
   132  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
        	// Raises a breakpoint trap (INT $3) and returns if execution is resumed.
   133  	INT $3
   134  	RET
   135  
   136  TEXT runtime·asminit(SB),NOSPLIT,$0-0
        	// Loads the x87 control word 0x27F from a temporary stack slot.
   137  	// Linux and MinGW start the FPU in extended double precision.
   138  	// Other operating systems use double precision.
   139  	// Change to double precision to match them,
   140  	// and to match other hardware that only has double.
   141  	PUSHL $0x27F			// put the new control word on the stack
   142  	FLDCW	0(SP)			// load it into the FPU
   143  	POPL AX				// drop the temporary slot (AX is clobbered)
   144  	RET
   145  
   146  /*
   147   *  go-routine
   148   */
   149  
   150  // void gosave(Gobuf*)
   151  // save state in Gobuf; setjmp
        // Records the caller's SP and PC and the current g in *buf,
        // and zeroes buf->ret and buf->ctxt.
   152  TEXT runtime·gosave(SB), NOSPLIT, $0-4
   153  	MOVL	buf+0(FP), AX		// gobuf
   154  	LEAL	buf+0(FP), BX		// caller's SP
   155  	MOVL	BX, gobuf_sp(AX)
   156  	MOVL	0(SP), BX		// caller's PC
   157  	MOVL	BX, gobuf_pc(AX)
   158  	MOVL	$0, gobuf_ret(AX)
   159  	MOVL	$0, gobuf_ctxt(AX)
   160  	get_tls(CX)
   161  	MOVL	g(CX), BX		// current g
   162  	MOVL	BX, gobuf_g(AX)
   163  	RET
   164  
   165  // void gogo(Gobuf*)
   166  // restore state from Gobuf; longjmp
        // Makes buf->g the current g, switches SP to buf->sp, loads AX from
        // buf->ret and DX from buf->ctxt, clears those three Gobuf fields and
        // jumps to buf->pc. Does not return to its caller.
   167  TEXT runtime·gogo(SB), NOSPLIT, $0-4
   168  	MOVL	buf+0(FP), BX		// gobuf
   169  	MOVL	gobuf_g(BX), DX
   170  	MOVL	0(DX), CX		// make sure g != nil
   171  	get_tls(CX)
   172  	MOVL	DX, g(CX)		// g = buf->g
   173  	MOVL	gobuf_sp(BX), SP	// restore SP
   174  	MOVL	gobuf_ret(BX), AX
   175  	MOVL	gobuf_ctxt(BX), DX
   176  	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
   177  	MOVL	$0, gobuf_ret(BX)
   178  	MOVL	$0, gobuf_ctxt(BX)
   179  	MOVL	gobuf_pc(BX), BX	// BX = resume PC (the Gobuf pointer is no longer needed)
   180  	JMP	BX
   181  
   182  // func mcall(fn func(*g))
   183  // Switch to m->g0's stack, call fn(g).
   184  // Fn must never return.  It should gogo(&g->sched)
   185  // to keep running g.
        // Jumps to badmcall if already running on m->g0, and to badmcall2
        // if fn does return.
   186  TEXT runtime·mcall(SB), NOSPLIT, $0-4
   187  	MOVL	fn+0(FP), DI
   188  	
   189  	get_tls(CX)
   190  	MOVL	g(CX), AX	// save state in g->sched
   191  	MOVL	0(SP), BX	// caller's PC
   192  	MOVL	BX, (g_sched+gobuf_pc)(AX)
   193  	LEAL	fn+0(FP), BX	// caller's SP
   194  	MOVL	BX, (g_sched+gobuf_sp)(AX)
   195  	MOVL	AX, (g_sched+gobuf_g)(AX)
   196  
   197  	// switch to m->g0 & its stack, call fn
   198  	MOVL	g(CX), BX
   199  	MOVL	g_m(BX), BX
   200  	MOVL	m_g0(BX), SI
   201  	CMPL	SI, AX	// if g == m->g0 call badmcall
   202  	JNE	3(PC)	// g != g0: skip the two-instruction badmcall jump
   203  	MOVL	$runtime·badmcall(SB), AX
   204  	JMP	AX
   205  	MOVL	SI, g(CX)	// g = m->g0
   206  	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   207  	PUSHL	AX		// argument to fn: the g we switched away from
   208  	MOVL	DI, DX		// DX = fn (the func value itself)
   209  	MOVL	0(DI), DI	// DI = code pointer stored in fn's first word
   210  	CALL	DI
   211  	POPL	AX		// only reached if fn returned
   212  	MOVL	$runtime·badmcall2(SB), AX
   213  	JMP	AX
   214  	RET
   215  
   216  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   217  // of the G stack.  We need to distinguish the routine that
   218  // lives at the bottom of the G stack from the one that lives
   219  // at the top of the system stack because the one at the top of
   220  // the system stack terminates the stack walk (see topofstack()).
        // Its address is what systemstack stores into g->sched.pc; the body
        // itself is a bare RET.
   221  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   222  	RET
   223  
   224  // func systemstack(fn func())
        // Runs fn on m->g0's stack. If the current g is already g0 or gsignal,
        // fn is called in place; if it is m->curg, the routine switches to g0,
        // calls fn and switches back. Any other g goes to badsystemstack.
   225  TEXT runtime·systemstack(SB), NOSPLIT, $0-4
   226  	MOVL	fn+0(FP), DI	// DI = fn
   227  	get_tls(CX)
   228  	MOVL	g(CX), AX	// AX = g
   229  	MOVL	g_m(AX), BX	// BX = m
   230  
   231  	MOVL	m_gsignal(BX), DX	// DX = gsignal
   232  	CMPL	AX, DX
   233  	JEQ	noswitch
   234  
   235  	MOVL	m_g0(BX), DX	// DX = g0
   236  	CMPL	AX, DX
   237  	JEQ	noswitch
   238  
   239  	MOVL	m_curg(BX), BP
   240  	CMPL	AX, BP
   241  	JEQ	switch
   242  	
   243  	// Bad: g is not gsignal, not g0, not curg. What is it?
   244  	// Hide call from linker nosplit analysis.
   245  	MOVL	$runtime·badsystemstack(SB), AX
   246  	CALL	AX
   247  
   248  switch:
   249  	// save our state in g->sched.  Pretend to
   250  	// be systemstack_switch if the G stack is scanned.
   251  	MOVL	$runtime·systemstack_switch(SB), (g_sched+gobuf_pc)(AX)
   252  	MOVL	SP, (g_sched+gobuf_sp)(AX)
   253  	MOVL	AX, (g_sched+gobuf_g)(AX)
   254  
   255  	// switch to g0
   256  	MOVL	DX, g(CX)	// DX still holds g0 from the check above
   257  	MOVL	(g_sched+gobuf_sp)(DX), BX
   258  	// make it look like mstart called systemstack on g0, to stop traceback
   259  	SUBL	$4, BX
   260  	MOVL	$runtime·mstart(SB), DX
   261  	MOVL	DX, 0(BX)	// fake return PC slot
   262  	MOVL	BX, SP
   263  
   264  	// call target function
   265  	MOVL	DI, DX		// DX = fn (the func value)
   266  	MOVL	0(DI), DI	// DI = code pointer stored in fn's first word
   267  	CALL	DI
   268  
   269  	// switch back to g
   270  	get_tls(CX)
   271  	MOVL	g(CX), AX
   272  	MOVL	g_m(AX), BX
   273  	MOVL	m_curg(BX), AX
   274  	MOVL	AX, g(CX)
   275  	MOVL	(g_sched+gobuf_sp)(AX), SP	// back to the SP saved at "switch"
   276  	MOVL	$0, (g_sched+gobuf_sp)(AX)
   277  	RET
   278  
   279  noswitch:
   280  	// already on system stack, just call directly
   281  	MOVL	DI, DX
   282  	MOVL	0(DI), DI
   283  	CALL	DI
   284  	RET
   285  
   286  /*
   287   * support for morestack
   288   */
   289  
   290  // Called during function prolog when more stack is needed.
   291  //
   292  // The traceback routines see morestack on a g0 as being
   293  // the top of a stack (for example, morestack calling newstack
   294  // calling the scheduler calling newm calling gc), so we must
   295  // record an argument size. For that purpose, it has no arguments.
        //
        // Traps if the current g is m->g0 or m->gsignal. Otherwise it records
        // f's caller in m->morebuf and f's own context (including DX) in
        // g->sched, then switches to m->g0 and calls newstack.
   296  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   297  	// Cannot grow scheduler stack (m->g0).
   298  	get_tls(CX)
   299  	MOVL	g(CX), BX
   300  	MOVL	g_m(BX), BX
   301  	MOVL	m_g0(BX), SI
   302  	CMPL	g(CX), SI
   303  	JNE	2(PC)		// not g0: skip the trap
   304  	INT	$3
   305  
   306  	// Cannot grow signal stack.
   307  	MOVL	m_gsignal(BX), SI
   308  	CMPL	g(CX), SI
   309  	JNE	2(PC)		// not gsignal: skip the trap
   310  	INT	$3
   311  
   312  	// Called from f.
   313  	// Set m->morebuf to f's caller.
   314  	MOVL	4(SP), DI	// f's caller's PC
   315  	MOVL	DI, (m_morebuf+gobuf_pc)(BX)
   316  	LEAL	8(SP), CX	// f's caller's SP
   317  	MOVL	CX, (m_morebuf+gobuf_sp)(BX)
   318  	get_tls(CX)		// CX was reused above; reload the TLS base
   319  	MOVL	g(CX), SI
   320  	MOVL	SI, (m_morebuf+gobuf_g)(BX)
   321  
   322  	// Set g->sched to context in f.
   323  	MOVL	0(SP), AX	// f's PC
   324  	MOVL	AX, (g_sched+gobuf_pc)(SI)
   325  	MOVL	SI, (g_sched+gobuf_g)(SI)
   326  	LEAL	4(SP), AX	// f's SP
   327  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   328  	MOVL	DX, (g_sched+gobuf_ctxt)(SI)	// DX as it was on entry (morestack_noctxt zeroes it)
   329  
   330  	// Call newstack on m->g0's stack.
   331  	MOVL	m_g0(BX), BP
   332  	MOVL	BP, g(CX)
   333  	MOVL	(g_sched+gobuf_sp)(BP), AX
   334  	MOVL	-4(AX), BX	// fault if CALL would, before smashing SP
   335  	MOVL	AX, SP
   336  	CALL	runtime·newstack(SB)
   337  	MOVL	$0, 0x1003	// crash if newstack returns
   338  	RET
   339  
   340  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0-0
        	// Zeroes DX, then tail-jumps to morestack, which stores DX
        	// into g->sched.ctxt.
   341  	MOVL	$0, DX
   342  	JMP runtime·morestack(SB)
   343  
   344  TEXT runtime·stackBarrier(SB),NOSPLIT,$0
        	// Looks up the saved return PC for the current stack barrier,
        	// advances g.stkbarPos by one and jumps to that PC.
        	// Uses CX, DX and BX; AX is left untouched.
   345  	// We came here via a RET to an overwritten return PC.
   346  	// AX may be live. Other registers are available.
   347  
   348  	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   349  	get_tls(CX)
   350  	MOVL	g(CX), CX			// CX = g
   351  	MOVL	(g_stkbar+slice_array)(CX), DX	// DX = base of g.stkbar
   352  	MOVL	g_stkbarPos(CX), BX		// BX = index
   353  	IMULL	$stkbar__size, BX	// Too big for SIB.
   354  	MOVL	stkbar_savedLRVal(DX)(BX*1), BX	// BX = stkbar[pos].savedLRVal
   355  	// Record that this stack barrier was hit.
   356  	ADDL	$1, g_stkbarPos(CX)
   357  	// Jump to the original return PC.
   358  	JMP	BX
   359  
   360  // reflectcall: call a function with the given argument list
   361  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   362  // we don't have variable-sized frames, so we use a small number
   363  // of constant-sized-frame functions to encode a few bits of size in the pc.
   364  // Caution: ugly multiline assembly macros in your future!
   365  
   366  #define DISPATCH(NAME,MAXSIZE)		\
   367  	CMPL	CX, $MAXSIZE;		\
   368  	JA	3(PC);			\
   369  	MOVL	$NAME(SB), AX;		\
   370  	JMP	AX
   371  // Note: can't just "JMP NAME(SB)" - bad inlining results.
        // DISPATCH(NAME, MAXSIZE): when CX <= MAXSIZE (unsigned compare), jump to
        // NAME through AX; otherwise JA 3(PC) skips the MOVL/JMP pair and execution
        // continues with whatever follows the macro.
   372  
   373  TEXT reflect·call(SB), NOSPLIT, $0-0
        	// Alias: tail-jumps to ·reflectcall with the caller's arguments in place.
   374  	JMP	·reflectcall(SB)
   375  
   376  TEXT ·reflectcall(SB), NOSPLIT, $0-20
        	// Loads argsize into CX and jumps to the first callN routine whose N
        	// is >= argsize (N runs over powers of two from 16 to 1<<30).
        	// If argsize exceeds 1<<30, jumps to badreflectcall.
   377  	MOVL	argsize+12(FP), CX
   378  	DISPATCH(runtime·call16, 16)
   379  	DISPATCH(runtime·call32, 32)
   380  	DISPATCH(runtime·call64, 64)
   381  	DISPATCH(runtime·call128, 128)
   382  	DISPATCH(runtime·call256, 256)
   383  	DISPATCH(runtime·call512, 512)
   384  	DISPATCH(runtime·call1024, 1024)
   385  	DISPATCH(runtime·call2048, 2048)
   386  	DISPATCH(runtime·call4096, 4096)
   387  	DISPATCH(runtime·call8192, 8192)
   388  	DISPATCH(runtime·call16384, 16384)
   389  	DISPATCH(runtime·call32768, 32768)
   390  	DISPATCH(runtime·call65536, 65536)
   391  	DISPATCH(runtime·call131072, 131072)
   392  	DISPATCH(runtime·call262144, 262144)
   393  	DISPATCH(runtime·call524288, 524288)
   394  	DISPATCH(runtime·call1048576, 1048576)
   395  	DISPATCH(runtime·call2097152, 2097152)
   396  	DISPATCH(runtime·call4194304, 4194304)
   397  	DISPATCH(runtime·call8388608, 8388608)
   398  	DISPATCH(runtime·call16777216, 16777216)
   399  	DISPATCH(runtime·call33554432, 33554432)
   400  	DISPATCH(runtime·call67108864, 67108864)
   401  	DISPATCH(runtime·call134217728, 134217728)
   402  	DISPATCH(runtime·call268435456, 268435456)
   403  	DISPATCH(runtime·call536870912, 536870912)
   404  	DISPATCH(runtime·call1073741824, 1073741824)
   405  	MOVL	$runtime·badreflectcall(SB), AX	// no frame size is large enough
   406  	JMP	AX
   407  
   408  #define CALLFN(NAME,MAXSIZE)			\
   409  TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
   410  	NO_LOCAL_POINTERS;			\
   411  	/* copy arguments to stack */		\
   412  	MOVL	argptr+8(FP), SI;		\
   413  	MOVL	argsize+12(FP), CX;		\
   414  	MOVL	SP, DI;				\
   415  	REP;MOVSB;				\
   416  	/* call function */			\
   417  	/* DX = f, AX = code pointer stored in f's first word */	\
   418  	MOVL	f+4(FP), DX;			\
   419  	MOVL	(DX), AX; 			\
   420  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   421  	CALL	AX;				\
   422  	/* copy return values back */		\
   423  	/* only bytes [retoffset, argsize) go from the frame to argptr */	\
   424  	MOVL	argptr+8(FP), DI;		\
   425  	MOVL	argsize+12(FP), CX;		\
   426  	MOVL	retoffset+16(FP), BX;		\
   427  	MOVL	SP, SI;				\
   428  	ADDL	BX, DI;				\
   429  	ADDL	BX, SI;				\
   430  	SUBL	BX, CX;				\
   431  	REP;MOVSB;				\
   432  	/* execute write barrier updates */	\
   433  	/* callwritebarrier(argtype, argptr, argsize, retoffset) */	\
   434  	MOVL	argtype+0(FP), DX;		\
   435  	MOVL	argptr+8(FP), DI;		\
   436  	MOVL	argsize+12(FP), CX;		\
   437  	MOVL	retoffset+16(FP), BX;		\
   438  	MOVL	DX, 0(SP);			\
   439  	MOVL	DI, 4(SP);			\
   440  	MOVL	CX, 8(SP);			\
   441  	MOVL	BX, 12(SP);			\
   442  	CALL	runtime·callwritebarrier(SB);	\
   443  	RET
   441  // One CALLFN routine per power-of-two frame size; each name matches a DISPATCH target in ·reflectcall.
   442  CALLFN(·call16, 16)
   443  CALLFN(·call32, 32)
   444  CALLFN(·call64, 64)
   445  CALLFN(·call128, 128)
   446  CALLFN(·call256, 256)
   447  CALLFN(·call512, 512)
   448  CALLFN(·call1024, 1024)
   449  CALLFN(·call2048, 2048)
   450  CALLFN(·call4096, 4096)
   451  CALLFN(·call8192, 8192)
   452  CALLFN(·call16384, 16384)
   453  CALLFN(·call32768, 32768)
   454  CALLFN(·call65536, 65536)
   455  CALLFN(·call131072, 131072)
   456  CALLFN(·call262144, 262144)
   457  CALLFN(·call524288, 524288)
   458  CALLFN(·call1048576, 1048576)
   459  CALLFN(·call2097152, 2097152)
   460  CALLFN(·call4194304, 4194304)
   461  CALLFN(·call8388608, 8388608)
   462  CALLFN(·call16777216, 16777216)
   463  CALLFN(·call33554432, 33554432)
   464  CALLFN(·call67108864, 67108864)
   465  CALLFN(·call134217728, 134217728)
   466  CALLFN(·call268435456, 268435456)
   467  CALLFN(·call536870912, 536870912)
   468  CALLFN(·call1073741824, 1073741824)
   469  
   470  // bool cas(int32 *val, int32 old, int32 new)
   471  // Atomically:
   472  //	if(*val == old){
   473  //		*val = new;
   474  //		return 1;
   475  //	}else
   476  //		return 0;
   477  TEXT runtime·cas(SB), NOSPLIT, $0-13
   478  	MOVL	ptr+0(FP), BX
   479  	MOVL	old+4(FP), AX		// CMPXCHG compares against AX
   480  	MOVL	new+8(FP), CX
   481  	LOCK
   482  	CMPXCHGL	CX, 0(BX)	// if 0(BX) == AX, store CX and set ZF
   483  	SETEQ	ret+12(FP)		// one-byte result: 1 when the swap happened
   484  	RET
   485  
   486  TEXT runtime·casuintptr(SB), NOSPLIT, $0-13
        	// Same 13-byte argument layout as cas, so this is a plain tail-jump.
   487  	JMP	runtime·cas(SB)
   488  
   489  TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-8
        	// Tail-jumps to atomicload (not defined in this part of the file).
   490  	JMP	runtime·atomicload(SB)
   491  
   492  TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-8
        	// Tail-jumps to atomicload (not defined in this part of the file).
   493  	JMP	runtime·atomicload(SB)
   494  
   495  TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-8
        	// Same 8-byte argument layout as atomicstore, so this is a plain tail-jump.
   496  	JMP	runtime·atomicstore(SB)
   497  
   498  // bool runtime·cas64(uint64 *val, uint64 old, uint64 new)
   499  // Atomically:
   500  //	if(*val == old){
   501  //		*val = new;
   502  //		return 1;
   503  //	} else {
   504  //		return 0;
   505  //	}
   506  TEXT runtime·cas64(SB), NOSPLIT, $0-21
   507  	MOVL	ptr+0(FP), BP
   508  	MOVL	old_lo+4(FP), AX	// DX:AX = old (compared against memory)
   509  	MOVL	old_hi+8(FP), DX
   510  	MOVL	new_lo+12(FP), BX	// CX:BX = new (stored on match)
   511  	MOVL	new_hi+16(FP), CX
   512  	LOCK
   513  	CMPXCHG8B	0(BP)
   514  	SETEQ	ret+20(FP)		// one-byte result: 1 when the swap happened
   515  	RET
   516  
   517  // bool casp1(void **p, void *old, void *new)
   518  // Atomically:
   519  //	if(*p == old){
   520  //		*p = new;
   521  //		return 1;
   522  //	}else
   523  //		return 0;
        // The instruction sequence is the same as in cas.
   524  TEXT runtime·casp1(SB), NOSPLIT, $0-13
   525  	MOVL	ptr+0(FP), BX
   526  	MOVL	old+4(FP), AX		// CMPXCHG compares against AX
   527  	MOVL	new+8(FP), CX
   528  	LOCK
   529  	CMPXCHGL	CX, 0(BX)	// if 0(BX) == AX, store CX and set ZF
   530  	SETEQ	ret+12(FP)		// one-byte result: 1 when the swap happened
   531  	RET
   532  
   533  // uint32 xadd(uint32 volatile *val, int32 delta)
   534  // Atomically:
   535  //	*val += delta;
   536  //	return *val;
   537  TEXT runtime·xadd(SB), NOSPLIT, $0-12
   538  	MOVL	ptr+0(FP), BX
   539  	MOVL	delta+4(FP), AX
   540  	MOVL	AX, CX			// keep a copy of delta; XADD overwrites AX
   541  	LOCK
   542  	XADDL	AX, 0(BX)		// *ptr += AX; AX = previous *ptr
   543  	ADDL	CX, AX			// previous value + delta = new value
   544  	MOVL	AX, ret+8(FP)
   545  	RET
   546  
   547  TEXT runtime·xchg(SB), NOSPLIT, $0-12
        	// Stores new into *ptr and returns the value that was there before.
   548  	MOVL	ptr+0(FP), BX
   549  	MOVL	new+4(FP), AX
   550  	XCHGL	AX, 0(BX)		// swap; XCHG with a memory operand is locked by the CPU
   551  	MOVL	AX, ret+8(FP)		// AX now holds the previous contents of *ptr
   552  	RET
   553  
   554  TEXT runtime·xchguintptr(SB), NOSPLIT, $0-12
        	// Same 12-byte argument layout as xchg, so this is a plain tail-jump.
   555  	JMP	runtime·xchg(SB)
   556  
   557  TEXT runtime·procyield(SB),NOSPLIT,$0-4
        	// Spins for `cycles` iterations, executing one PAUSE per iteration.
        	// The argument size is declared as 4 because the body reads the
        	// 4-byte cycles argument at 0(FP); no result is written.
        	// NOTE: the counter is decremented before it is tested, so a cycles
        	// value of 0 wraps around instead of returning immediately.
   558  	MOVL	cycles+0(FP), AX
   559  again:
   560  	PAUSE
   561  	SUBL	$1, AX
   562  	JNZ	again
   563  	RET
   564  
   565  TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-8
        	// Stores val into *ptr with XCHGL; the previous contents end up
        	// in AX and are discarded.
   566  	MOVL	ptr+0(FP), BX
   567  	MOVL	val+4(FP), AX
   568  	XCHGL	AX, 0(BX)
   569  	RET
   570  
   571  TEXT runtime·atomicstore(SB), NOSPLIT, $0-8
        	// Stores val into *ptr with XCHGL; the previous contents end up
        	// in AX and are discarded.
   572  	MOVL	ptr+0(FP), BX
   573  	MOVL	val+4(FP), AX
   574  	XCHGL	AX, 0(BX)
   575  	RET
   576  
   577  // uint64 atomicload64(uint64 volatile* addr);
        // Loads 8 bytes from addr with a single MMX MOVQ and stores them into
        // the result slot. Deliberately faults when addr is not 8-byte aligned.
   578  TEXT runtime·atomicload64(SB), NOSPLIT, $0-12
   579  	MOVL	ptr+0(FP), AX
   580  	TESTL	$7, AX
   581  	JZ	2(PC)		// aligned: skip the deliberate fault
   582  	MOVL	0, AX // crash with nil ptr deref
   583  	LEAL	ret_lo+4(FP), BX	// BX = address of the 64-bit result slot
   584  	// MOVQ (%EAX), %MM0
   585  	BYTE $0x0f; BYTE $0x6f; BYTE $0x00
   586  	// MOVQ %MM0, 0(%EBX)
   587  	BYTE $0x0f; BYTE $0x7f; BYTE $0x03
   588  	// EMMS
   589  	BYTE $0x0F; BYTE $0x77
   590  	RET
   591  
   592  // void runtime·atomicstore64(uint64 volatile* addr, uint64 v);
        // Stores the 8-byte v to addr with a single MMX MOVQ, then executes a
        // locked XADDL of zero on the stack as a fence. Deliberately faults
        // when addr is not 8-byte aligned.
   593  TEXT runtime·atomicstore64(SB), NOSPLIT, $0-12
   594  	MOVL	ptr+0(FP), AX
   595  	TESTL	$7, AX
   596  	JZ	2(PC)		// aligned: skip the deliberate fault
   597  	MOVL	0, AX // crash with nil ptr deref
   598  	// MOVQ and EMMS were introduced on the Pentium MMX.
   599  	// MOVQ 0x8(%ESP), %MM0
        	// (with a zero-size frame, 0(SP) is the return PC, 4(SP) is ptr and 8(SP) is v)
   600  	BYTE $0x0f; BYTE $0x6f; BYTE $0x44; BYTE $0x24; BYTE $0x08
   601  	// MOVQ %MM0, (%EAX)
   602  	BYTE $0x0f; BYTE $0x7f; BYTE $0x00 
   603  	// EMMS
   604  	BYTE $0x0F; BYTE $0x77
   605  	// This is essentially a no-op, but it provides required memory fencing.
   606  	// It can be replaced with MFENCE, but MFENCE was introduced only on the Pentium4 (SSE2).
   607  	MOVL	$0, AX
   608  	LOCK
   609  	XADDL	AX, (SP)	// adds 0, so the word at (SP) keeps its value
   610  	RET
   611  
   612  // void	runtime·atomicor8(byte volatile*, byte);
        // Atomically: *ptr |= val (one byte, LOCK-prefixed).
   613  TEXT runtime·atomicor8(SB), NOSPLIT, $0-5
   614  	MOVL	ptr+0(FP), AX
   615  	MOVB	val+4(FP), BX
   616  	LOCK
   617  	ORB	BX, (AX)
   618  	RET
   619  
   620  // void	runtime·atomicand8(byte volatile*, byte);
        // Atomically: *ptr &= val (one byte, LOCK-prefixed).
   621  TEXT runtime·atomicand8(SB), NOSPLIT, $0-5
   622  	MOVL	ptr+0(FP), AX
   623  	MOVB	val+4(FP), BX
   624  	LOCK
   625  	ANDB	BX, (AX)
   626  	RET
   627  
   628  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
        	// Executes no instructions other than RET.
   629  	// Stores are already ordered on x86, so this is just a
   630  	// compile barrier.
   631  	RET
   632  
   633  // void jmpdefer(fn, sp);
   634  // called from deferreturn.
   635  // 1. pop the caller
   636  // 2. sub 5 bytes from the caller's return address
   637  // 3. jmp to the argument
        // The deferred function therefore returns to the CALL instruction
        // that invoked the caller, which runs again. Leaves DX = fn.
   638  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
   639  	MOVL	fv+0(FP), DX	// fn
   640  	MOVL	argp+4(FP), BX	// caller sp
   641  	LEAL	-4(BX), SP	// caller sp after CALL
   642  	SUBL	$5, (SP)	// return to CALL again
   643  	MOVL	0(DX), BX	// BX = code pointer stored in fn's first word
   644  	JMP	BX	// but first run the deferred function
   645  
   646  // Save state of caller into g->sched.
        // Writes the caller's SP and return PC into g->sched and zeroes
        // g->sched.ret and g->sched.ctxt. AX and BX are saved and restored
        // around the body, so no general registers are changed.
   647  TEXT gosave<>(SB),NOSPLIT,$0
   648  	PUSHL	AX
   649  	PUSHL	BX
   650  	get_tls(BX)
   651  	MOVL	g(BX), BX		// BX = current g
   652  	LEAL	arg+0(FP), AX		// AX = caller's SP (address just above our return PC)
   653  	MOVL	AX, (g_sched+gobuf_sp)(BX)
   654  	MOVL	-4(AX), AX		// AX = our return PC, i.e. the caller's PC
   655  	MOVL	AX, (g_sched+gobuf_pc)(BX)
   656  	MOVL	$0, (g_sched+gobuf_ret)(BX)
   657  	MOVL	$0, (g_sched+gobuf_ctxt)(BX)
   658  	POPL	BX
   659  	POPL	AX
   660  	RET
   661  
   662  // func asmcgocall(fn, arg unsafe.Pointer) int32
   663  // Call fn(arg) on the scheduler stack,
   664  // aligned appropriately for the gcc ABI.
   665  // See cgocall.go for more details.
        // The value fn leaves in AX is returned as the int32 result.
   666  TEXT ·asmcgocall(SB),NOSPLIT,$0-12
   667  	MOVL	fn+0(FP), AX
   668  	MOVL	arg+4(FP), BX
   669  
   670  	MOVL	SP, DX			// DX = SP on entry, used for the depth computation below
   671  
   672  	// Figure out if we need to switch to m->g0 stack.
   673  	// We get called to create new OS threads too, and those
   674  	// come in on the m->g0 stack already.
   675  	get_tls(CX)
   676  	MOVL	g(CX), BP
   677  	MOVL	g_m(BP), BP
   678  	MOVL	m_g0(BP), SI
   679  	MOVL	g(CX), DI
   680  	CMPL	SI, DI
   681  	JEQ	4(PC)			// already on g0: skip the three switching instructions
   682  	CALL	gosave<>(SB)
   683  	MOVL	SI, g(CX)
   684  	MOVL	(g_sched+gobuf_sp)(SI), SP
   685  
   686  	// Now on a scheduling stack (a pthread-created stack).
   687  	SUBL	$32, SP
   688  	ANDL	$~15, SP	// alignment, perhaps unnecessary
   689  	MOVL	DI, 8(SP)	// save g
   690  	MOVL	(g_stack+stack_hi)(DI), DI
   691  	SUBL	DX, DI
   692  	MOVL	DI, 4(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   693  	MOVL	BX, 0(SP)	// first argument in x86-32 ABI
   694  	CALL	AX
   695  
   696  	// Restore registers, g, stack pointer.
   697  	get_tls(CX)
   698  	MOVL	8(SP), DI		// DI = saved g
   699  	MOVL	(g_stack+stack_hi)(DI), SI
   700  	SUBL	4(SP), SI		// SI = stack.hi - saved depth = original SP
   701  	MOVL	DI, g(CX)
   702  	MOVL	SI, SP
   703  
   704  	MOVL	AX, ret+8(FP)
   705  	RET
   706  
   707  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
   708  // Turn the fn into a Go func (by taking its address) and call
   709  // cgocallback_gofunc.
   710  TEXT runtime·cgocallback(SB),NOSPLIT,$12-12
   711  	LEAL	fn+0(FP), AX		// &fn: a pointer to a word holding the code pointer
   712  	MOVL	AX, 0(SP)
   713  	MOVL	frame+4(FP), AX
   714  	MOVL	AX, 4(SP)
   715  	MOVL	framesize+8(FP), AX
   716  	MOVL	AX, 8(SP)
   717  	MOVL	$runtime·cgocallback_gofunc(SB), AX	// called indirectly through AX
   718  	CALL	AX
   719  	RET
   720  
   721  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
   722  // See cgocall.go for more details.
        // Obtains an m through needm when the thread has no g, switches from
        // m->g0 to m->curg, calls cgocallbackg there, switches back, and calls
        // dropm if the m was borrowed.
   723  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-12
   724  	NO_LOCAL_POINTERS
   725  
   726  	// If g is nil, Go did not create the current thread.
   727  	// Call needm to obtain one for temporary use.
   728  	// In this case, we're running on the thread stack, so there's
   729  	// lots of space, but the linker doesn't know. Hide the call from
   730  	// the linker analysis by using an indirect call through AX.
   731  	get_tls(CX)
   732  #ifdef GOOS_windows
        	// With a zero TLS base, keep BP = 0 and skip the load of g(CX)
        	// that follows the #endif, so the needm path is taken.
   733  	MOVL	$0, BP
   734  	CMPL	CX, $0
   735  	JEQ	2(PC) // TODO
   736  #endif
   737  	MOVL	g(CX), BP
   738  	CMPL	BP, $0
   739  	JEQ	needm
   740  	MOVL	g_m(BP), BP
   741  	MOVL	BP, DX // saved copy of oldm
   742  	JMP	havem
   743  needm:
   744  	MOVL	$0, 0(SP)
   745  	MOVL	$runtime·needm(SB), AX
   746  	CALL	AX
   747  	MOVL	0(SP), DX		// DX = 0(SP) as left after the needm call; tested against 0 before dropm below
   748  	get_tls(CX)
   749  	MOVL	g(CX), BP
   750  	MOVL	g_m(BP), BP
   751  
   752  	// Set m->sched.sp = SP, so that if a panic happens
   753  	// during the function we are about to execute, it will
   754  	// have a valid SP to run on the g0 stack.
   755  	// The next few lines (after the havem label)
   756  	// will save this SP onto the stack and then write
   757  	// the same SP back to m->sched.sp. That seems redundant,
   758  	// but if an unrecovered panic happens, unwindm will
   759  	// restore the g->sched.sp from the stack location
   760  	// and then systemstack will try to use it. If we don't set it here,
   761  	// that restored SP will be uninitialized (typically 0) and
   762  	// will not be usable.
   763  	MOVL	m_g0(BP), SI
   764  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   765  
   766  havem:
   767  	// Now there's a valid m, and we're running on its m->g0.
   768  	// Save current m->g0->sched.sp on stack and then set it to SP.
   769  	// Save current sp in m->g0->sched.sp in preparation for
   770  	// switch back to m->curg stack.
   771  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   772  	MOVL	m_g0(BP), SI
   773  	MOVL	(g_sched+gobuf_sp)(SI), AX
   774  	MOVL	AX, 0(SP)
   775  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   776  
   777  	// Switch to m->curg stack and call runtime.cgocallbackg.
   778  	// Because we are taking over the execution of m->curg
   779  	// but *not* resuming what had been running, we need to
   780  	// save that information (m->curg->sched) so we can restore it.
   781  	// We can restore m->curg->sched.sp easily, because calling
   782  	// runtime.cgocallbackg leaves SP unchanged upon return.
   783  	// To save m->curg->sched.pc, we push it onto the stack.
   784  	// This has the added benefit that it looks to the traceback
   785  	// routine like cgocallbackg is going to return to that
   786  	// PC (because the frame we allocate below has the same
   787  	// size as cgocallback_gofunc's frame declared above)
   788  	// so that the traceback will seamlessly trace back into
   789  	// the earlier calls.
   790  	//
   791  	// In the new goroutine, 0(SP) holds the saved oldm (DX) register.
   792  	// 4(SP) and 8(SP) are unused.
   793  	MOVL	m_curg(BP), SI
   794  	MOVL	SI, g(CX)
   795  	MOVL	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
   796  	MOVL	(g_sched+gobuf_pc)(SI), BP
   797  	MOVL	BP, -4(DI)		// saved curg pc sits where a return PC would
   798  	LEAL	-(4+12)(DI), SP		// 4 bytes for that pc plus a 12-byte frame
   799  	MOVL	DX, 0(SP)
   800  	CALL	runtime·cgocallbackg(SB)
   801  	MOVL	0(SP), DX
   802  
   803  	// Restore g->sched (== m->curg->sched) from saved values.
   804  	get_tls(CX)
   805  	MOVL	g(CX), SI
   806  	MOVL	12(SP), BP		// the pc stored at -4(DI) above
   807  	MOVL	BP, (g_sched+gobuf_pc)(SI)
   808  	LEAL	(12+4)(SP), DI		// the original curg sched.sp
   809  	MOVL	DI, (g_sched+gobuf_sp)(SI)
   810  
   811  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   812  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   813  	// so we do not have to restore it.)
   814  	MOVL	g(CX), BP
   815  	MOVL	g_m(BP), BP
   816  	MOVL	m_g0(BP), SI
   817  	MOVL	SI, g(CX)
   818  	MOVL	(g_sched+gobuf_sp)(SI), SP
   819  	MOVL	0(SP), AX		// the g0 sched.sp saved at havem
   820  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   821  	
   822  	// If the m on entry was nil, we called needm above to borrow an m
   823  	// for the duration of the call. Since the call is over, return it with dropm.
   824  	CMPL	DX, $0
   825  	JNE 3(PC)			// had our own m: skip the two-instruction dropm call
   826  	MOVL	$runtime·dropm(SB), AX
   827  	CALL	AX
   828  
   829  	// Done!
   830  	RET
   831  
   832  // void setg(G*); set g. for use by needm.
        // Stores gg into the TLS g slot. On Windows it first points 0x14(FS)
        // at gg->m->tls, or zeroes 0x14(FS) and returns when gg is nil.
   833  TEXT runtime·setg(SB), NOSPLIT, $0-4
   834  	MOVL	gg+0(FP), BX
   835  #ifdef GOOS_windows
   836  	CMPL	BX, $0
   837  	JNE	settls
   838  	MOVL	$0, 0x14(FS)		// nil g: clear the slot and return without touching g(CX)
   839  	RET
   840  settls:
   841  	MOVL	g_m(BX), AX
   842  	LEAL	m_tls(AX), AX		// AX = &gg->m->tls
   843  	MOVL	AX, 0x14(FS)
   844  #endif
   845  	get_tls(CX)
   846  	MOVL	BX, g(CX)
   847  	RET
   848  
   849  // void setg_gcc(G*); set g. for use by gcc
        // Stores its stack argument into the TLS g slot; uses AX and DX.
        // rt0_go passes this routine's address to _cgo_init.
   850  TEXT setg_gcc<>(SB), NOSPLIT, $0
   851  	get_tls(AX)
   852  	MOVL	gg+0(FP), DX
   853  	MOVL	DX, g(AX)
   854  	RET
   855  
   856  // stackcheck traps with INT $3 unless g->stack.lo < SP < g->stack.hi.
        // Only CX and AX are used as scratch.
   857  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   858  	get_tls(CX)
   859  	MOVL	g(CX), AX			// AX = current g
   860  	CMPL	(g_stack+stack_hi)(AX), SP	// upper bound: need stack.hi > SP
   861  	JHI	belowhi
   862  	INT	$3
        belowhi:
   863  	CMPL	SP, (g_stack+stack_lo)(AX)	// lower bound: need SP > stack.lo
   864  	JHI	abovelo
   865  	INT	$3
        abovelo:
   866  	RET
   867  
   868  TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8
        	// Returns the word stored just below argp (the return PC of the
        	// function whose first argument is at argp). If that word equals
        	// stackBarrierPC, the value produced by nextBarrierPC is returned instead.
   869  	MOVL	argp+0(FP),AX		// addr of first arg
   870  	MOVL	-4(AX),AX		// get calling pc
   871  	CMPL	AX, runtime·stackBarrierPC(SB)
   872  	JNE	nobar
   873  	// Get original return PC.
   874  	CALL	runtime·nextBarrierPC(SB)
   875  	MOVL	0(SP), AX		// nextBarrierPC's result, left in our 4-byte frame
   876  nobar:
   877  	MOVL	AX, ret+4(FP)
   878  	RET
   879  
   880  TEXT runtime·setcallerpc(SB),NOSPLIT,$4-8
        	// Overwrites the return PC stored just below argp with pc. If that
        	// slot currently holds stackBarrierPC, the slot is left alone and
        	// pc is handed to setNextBarrierPC instead.
   881  	MOVL	argp+0(FP),AX		// addr of first arg
   882  	MOVL	pc+4(FP), BX
   883  	MOVL	-4(AX), CX		// current return PC
   884  	CMPL	CX, runtime·stackBarrierPC(SB)
   885  	JEQ	setbar
   886  	MOVL	BX, -4(AX)		// set calling pc
   887  	RET
   888  setbar:
   889  	// Set the stack barrier return PC.
   890  	MOVL	BX, 0(SP)		// argument for setNextBarrierPC
   891  	CALL	runtime·setNextBarrierPC(SB)
   892  	RET
   893  
   894  TEXT runtime·getcallersp(SB), NOSPLIT, $0-8
        	// Returns its argp argument unchanged.
   895  	MOVL	argp+0(FP), AX
   896  	MOVL	AX, ret+4(FP)
   897  	RET
   898  
   899  // func cputicks() int64
        // Returns the RDTSC counter. When the SSE2 bit is set in cpuid_edx, a
        // fence is executed first: LFENCE if lfenceBeforeRdtsc is 1, otherwise
        // MFENCE. Without SSE2 no fence is issued.
   900  TEXT runtime·cputicks(SB),NOSPLIT,$0-8
   901  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // no sse2, no mfence
   902  	JEQ	done
   903  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1	// set by rt0_go for "GenuineIntel"
   904  	JNE	mfence
   905  	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
   906  	JMP	done
   907  mfence:
   908  	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
   909  done:
   910  	RDTSC				// counter in DX:AX
   911  	MOVL	AX, ret_lo+0(FP)
   912  	MOVL	DX, ret_hi+4(FP)
   913  	RET
   914  
   915  TEXT runtime·ldt0setup(SB),NOSPLIT,$16-0
        	// Calls setldt(7, &tls0, 32).
   916  	// set up ldt 7 to point at tls0
   917  	// ldt 1 would be fine on Linux, but on OS X, 7 is as low as we can go.
   918  	// the entry number is just a hint.  setldt will set up GS with what it used.
   919  	MOVL	$7, 0(SP)		// requested entry number
   920  	LEAL	runtime·tls0(SB), AX
   921  	MOVL	AX, 4(SP)		// address of tls0
   922  	MOVL	$32, 8(SP)	// sizeof(tls array)
   923  	CALL	runtime·setldt(SB)
   924  	RET
   925  
   926  TEXT runtime·emptyfunc(SB),0,$0-0
        	// Body is a bare RET. Declared with flag 0 rather than NOSPLIT;
        	// rt0_go calls it with the note "fault if stack check is wrong",
        	// so presumably the point is to exercise the stack-check prologue.
   927  	RET
   928  
   929  TEXT runtime·abort(SB),NOSPLIT,$0-0
        	// Raises a breakpoint trap and never returns. If execution is
        	// resumed after the trap (for example under a debugger), spin
        	// here instead of running off the end of this function into
        	// whatever code follows it.
   930  	INT $0x3
        loop:
        	JMP	loop
   931  
   932  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   933  // redirects to memhash(p, h, size) using the size
   934  // stored in the closure.
        // DX must hold the closure pointer on entry; the size is read from
        // the word at offset 4 of it.
   935  TEXT runtime·memhash_varlen(SB),NOSPLIT,$16-12
   936  	GO_ARGS
   937  	NO_LOCAL_POINTERS
   938  	MOVL	p+0(FP), AX
   939  	MOVL	h+4(FP), BX
   940  	MOVL	4(DX), CX		// size = second word of the closure
   941  	MOVL	AX, 0(SP)
   942  	MOVL	BX, 4(SP)
   943  	MOVL	CX, 8(SP)
   944  	CALL	runtime·memhash(SB)
   945  	MOVL	12(SP), AX		// memhash's result slot, after its three arguments
   946  	MOVL	AX, ret+8(FP)
   947  	RET
   948  
   949  // hash function using AES hardware instructions
        // Loads the register arguments for aeshashbody (AX = data pointer,
        // CX = size, DX = address of the result slot) and tail-jumps to it.
        // The seed is not loaded here; aeshashbody reads h+4(FP) itself.
   950  TEXT runtime·aeshash(SB),NOSPLIT,$0-16
   951  	MOVL	p+0(FP), AX	// ptr to data
   952  	MOVL	s+8(FP), CX	// size
   953  	LEAL	ret+12(FP), DX	// where aeshashbody stores the hash
   954  	JMP	runtime·aeshashbody(SB)
   955  
   956  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
        	// String variant of aeshash: p points at a string header, whose data
        	// pointer and length become aeshashbody's AX and CX; DX is the
        	// address of the result slot.
   957  	MOVL	p+0(FP), AX	// ptr to string object
   958  	MOVL	4(AX), CX	// length of string
   959  	MOVL	(AX), AX	// string data
   960  	LEAL	ret+8(FP), DX	// where aeshashbody stores the hash
   961  	JMP	runtime·aeshashbody(SB)
   962  
   963  // AX: data
   964  // CX: length
   965  // DX: address to put return value
        //
        // Entered by JMP from aeshash/aeshashstr, so FP still names the caller's
        // arguments; the seed is read from h+4(FP). Dispatches on the length
        // (0, 1-15, 16, 17-32, 33-64, 65+ bytes) and stores the low 32 bits of
        // the final XMM state through DX.
   966  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   967  	MOVL	h+4(FP), X6	// seed to low 64 bits of xmm6
   968  	PINSRD	$2, CX, X6	// size to high 64 bits of xmm6
   969  	PSHUFHW	$0, X6, X6	// replace size with its low 2 bytes repeated 4 times
   970  	MOVO	runtime·aeskeysched(SB), X7	// X7 = first 16 bytes of the key schedule, the round key for the later scrambles
   971  	CMPL	CX, $16
   972  	JB	aes0to15
   973  	JE	aes16	// still using the flags from the CMPL against 16
   974  	CMPL	CX, $32
   975  	JBE	aes17to32
   976  	CMPL	CX, $64
   977  	JBE	aes33to64
   978  	JMP	aes65plus
   979  	
   980  aes0to15:
   981  	TESTL	CX, CX
   982  	JE	aes0
   983  
        	// AX = p+16. Bits 4-11 of p+16 are all zero exactly when bits 4-11
        	// of p are all ones, i.e. p lies in the last 16 bytes of a
        	// 4096-byte-aligned region, where a 16-byte load could run past it.
   984  	ADDL	$16, AX
   985  	TESTW	$0xff0, AX
   986  	JE	endofpage
   987  
   988  	// 16 bytes loaded at this address won't cross
   989  	// a page boundary, so we can load it directly.
   990  	MOVOU	-16(AX), X0	// AX was advanced by 16 above, so this loads from p
   991  	ADDL	CX, CX	// CX = 2*len, so (CX*8) below indexes 16-byte table entry number len
   992  	PAND	masks<>(SB)(CX*8), X0	// keep only the low len bytes
   993  
   994  	// scramble 3 times
   995  	AESENC	X6, X0
   996  	AESENC	X7, X0
   997  	AESENC	X7, X0
   998  	MOVL	X0, (DX)
   999  	RET
  1000  
  1001  endofpage:
  1002  	// bits 4-11 of the address are all ones (it ends in 1111 1111 xxxx).
  1003  	// Might be up against a page boundary, so load ending at last byte.
  1004  	// Then shift bytes down using pshufb.
  1005  	MOVOU	-32(AX)(CX*1), X0	// AX = p+16, so this is the 16 bytes ending at p+len
  1006  	ADDL	CX, CX	// CX = 2*len, so (CX*8) below indexes 16-byte table entry number len
  1007  	PSHUFB	shifts<>(SB)(CX*8), X0
  1008  	AESENC	X6, X0
  1009  	AESENC	X7, X0
  1010  	AESENC	X7, X0
  1011  	MOVL	X0, (DX)
  1012  	RET
  1013  
  1014  aes0:
  1015  	// return input seed
  1016  	MOVL	h+4(FP), AX
  1017  	MOVL	AX, (DX)
  1018  	RET
  1019  
  1020  aes16:
  1021  	MOVOU	(AX), X0
  1022  	AESENC	X6, X0
  1023  	AESENC	X7, X0
  1024  	AESENC	X7, X0
  1025  	MOVL	X0, (DX)
  1026  	RET
  1027  
  1028  
  1029  aes17to32:
  1030  	// load data to be hashed
  1031  	MOVOU	(AX), X0
  1032  	MOVOU	-16(AX)(CX*1), X1	// last 16 bytes; overlaps X0 when len < 32
  1033  
  1034  	// scramble 3 times
  1035  	AESENC	X6, X0
  1036  	AESENC	runtime·aeskeysched+16(SB), X1
  1037  	AESENC	X7, X0
  1038  	AESENC	X7, X1
  1039  	AESENC	X7, X0
  1040  	AESENC	X7, X1
  1041  
  1042  	// combine results
  1043  	PXOR	X1, X0
  1044  	MOVL	X0, (DX)
  1045  	RET
  1046  
  1047  aes33to64:
        	// first 32 and last 32 bytes; the two halves overlap when len < 64
  1048  	MOVOU	(AX), X0
  1049  	MOVOU	16(AX), X1
  1050  	MOVOU	-32(AX)(CX*1), X2
  1051  	MOVOU	-16(AX)(CX*1), X3
  1052  	
  1053  	AESENC	X6, X0
  1054  	AESENC	runtime·aeskeysched+16(SB), X1
  1055  	AESENC	runtime·aeskeysched+32(SB), X2
  1056  	AESENC	runtime·aeskeysched+48(SB), X3
  1057  	AESENC	X7, X0
  1058  	AESENC	X7, X1
  1059  	AESENC	X7, X2
  1060  	AESENC	X7, X3
  1061  	AESENC	X7, X0
  1062  	AESENC	X7, X1
  1063  	AESENC	X7, X2
  1064  	AESENC	X7, X3
  1065  
  1066  	PXOR	X2, X0
  1067  	PXOR	X3, X1
  1068  	PXOR	X1, X0
  1069  	MOVL	X0, (DX)
  1070  	RET
  1071  
  1072  aes65plus:
  1073  	// start with last (possibly overlapping) block
  1074  	MOVOU	-64(AX)(CX*1), X0
  1075  	MOVOU	-48(AX)(CX*1), X1
  1076  	MOVOU	-32(AX)(CX*1), X2
  1077  	MOVOU	-16(AX)(CX*1), X3
  1078  
  1079  	// scramble state once
  1080  	AESENC	X6, X0
  1081  	AESENC	runtime·aeskeysched+16(SB), X1
  1082  	AESENC	runtime·aeskeysched+32(SB), X2
  1083  	AESENC	runtime·aeskeysched+48(SB), X3
  1084  
  1085  	// compute number of remaining 64-byte blocks
        	// CX = (len-1)/64, which is at least 1 here because len >= 65
  1086  	DECL	CX
  1087  	SHRL	$6, CX
  1088  	
  1089  aesloop:
  1090  	// scramble state, xor in a block
  1091  	MOVOU	(AX), X4
  1092  	MOVOU	16(AX), X5
  1093  	AESENC	X4, X0
  1094  	AESENC	X5, X1
  1095  	MOVOU	32(AX), X4
  1096  	MOVOU	48(AX), X5
  1097  	AESENC	X4, X2
  1098  	AESENC	X5, X3
  1099  
  1100  	// scramble state
  1101  	AESENC	X7, X0
  1102  	AESENC	X7, X1
  1103  	AESENC	X7, X2
  1104  	AESENC	X7, X3
  1105  
  1106  	ADDL	$64, AX
  1107  	DECL	CX
  1108  	JNE	aesloop
  1109  
  1110  	// 2 more scrambles to finish
  1111  	AESENC	X7, X0
  1112  	AESENC	X7, X1
  1113  	AESENC	X7, X2
  1114  	AESENC	X7, X3
  1115  	AESENC	X7, X0
  1116  	AESENC	X7, X1
  1117  	AESENC	X7, X2
  1118  	AESENC	X7, X3
  1119  
  1120  	PXOR	X2, X0
  1121  	PXOR	X3, X1
  1122  	PXOR	X1, X0
  1123  	MOVL	X0, (DX)
  1124  	RET
  1125  
  1126  TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
        	// Hashes the 4 bytes at p with seed h: seed in dword 0 and data in
        	// dword 1 of X0, then three AESENC rounds keyed from aeskeysched.
        	// The low 32 bits of X0 are the result.
  1127  	MOVL	p+0(FP), AX	// ptr to data
  1128  	MOVL	h+4(FP), X0	// seed
  1129  	PINSRD	$1, (AX), X0	// data
  1130  	AESENC	runtime·aeskeysched+0(SB), X0
  1131  	AESENC	runtime·aeskeysched+16(SB), X0
  1132  	AESENC	runtime·aeskeysched+32(SB), X0
  1133  	MOVL	X0, ret+8(FP)
  1134  	RET
  1135  
  1136  TEXT runtime·aeshash64(SB),NOSPLIT,$0-12
        	// Hashes the 8 bytes at p with seed h: data in the low 64 bits and
        	// seed in dword 2 of X0, then three AESENC rounds keyed from
        	// aeskeysched. The low 32 bits of X0 are the result.
  1137  	MOVL	p+0(FP), AX	// ptr to data
  1138  	MOVQ	(AX), X0	// data
  1139  	PINSRD	$2, h+4(FP), X0	// seed
  1140  	AESENC	runtime·aeskeysched+0(SB), X0
  1141  	AESENC	runtime·aeskeysched+16(SB), X0
  1142  	AESENC	runtime·aeskeysched+32(SB), X0
  1143  	MOVL	X0, ret+8(FP)
  1144  	RET
  1145  
  1146  // simple mask to get rid of data in the high part of the register.
        // Sixteen 16-byte entries: entry n (at byte offset 16*n, n = 0..15) has its
        // low n bytes set to 0xff and the remaining bytes zero. aeshashbody indexes
        // it as masks<>(SB)(CX*8) after doubling CX, i.e. by 16*length.
  1147  DATA masks<>+0x00(SB)/4, $0x00000000
  1148  DATA masks<>+0x04(SB)/4, $0x00000000
  1149  DATA masks<>+0x08(SB)/4, $0x00000000
  1150  DATA masks<>+0x0c(SB)/4, $0x00000000
  1151  	
  1152  DATA masks<>+0x10(SB)/4, $0x000000ff
  1153  DATA masks<>+0x14(SB)/4, $0x00000000
  1154  DATA masks<>+0x18(SB)/4, $0x00000000
  1155  DATA masks<>+0x1c(SB)/4, $0x00000000
  1156  	
  1157  DATA masks<>+0x20(SB)/4, $0x0000ffff
  1158  DATA masks<>+0x24(SB)/4, $0x00000000
  1159  DATA masks<>+0x28(SB)/4, $0x00000000
  1160  DATA masks<>+0x2c(SB)/4, $0x00000000
  1161  	
  1162  DATA masks<>+0x30(SB)/4, $0x00ffffff
  1163  DATA masks<>+0x34(SB)/4, $0x00000000
  1164  DATA masks<>+0x38(SB)/4, $0x00000000
  1165  DATA masks<>+0x3c(SB)/4, $0x00000000
  1166  	
  1167  DATA masks<>+0x40(SB)/4, $0xffffffff
  1168  DATA masks<>+0x44(SB)/4, $0x00000000
  1169  DATA masks<>+0x48(SB)/4, $0x00000000
  1170  DATA masks<>+0x4c(SB)/4, $0x00000000
  1171  	
  1172  DATA masks<>+0x50(SB)/4, $0xffffffff
  1173  DATA masks<>+0x54(SB)/4, $0x000000ff
  1174  DATA masks<>+0x58(SB)/4, $0x00000000
  1175  DATA masks<>+0x5c(SB)/4, $0x00000000
  1176  	
  1177  DATA masks<>+0x60(SB)/4, $0xffffffff
  1178  DATA masks<>+0x64(SB)/4, $0x0000ffff
  1179  DATA masks<>+0x68(SB)/4, $0x00000000
  1180  DATA masks<>+0x6c(SB)/4, $0x00000000
  1181  	
  1182  DATA masks<>+0x70(SB)/4, $0xffffffff
  1183  DATA masks<>+0x74(SB)/4, $0x00ffffff
  1184  DATA masks<>+0x78(SB)/4, $0x00000000
  1185  DATA masks<>+0x7c(SB)/4, $0x00000000
  1186  	
  1187  DATA masks<>+0x80(SB)/4, $0xffffffff
  1188  DATA masks<>+0x84(SB)/4, $0xffffffff
  1189  DATA masks<>+0x88(SB)/4, $0x00000000
  1190  DATA masks<>+0x8c(SB)/4, $0x00000000
  1191  	
  1192  DATA masks<>+0x90(SB)/4, $0xffffffff
  1193  DATA masks<>+0x94(SB)/4, $0xffffffff
  1194  DATA masks<>+0x98(SB)/4, $0x000000ff
  1195  DATA masks<>+0x9c(SB)/4, $0x00000000
  1196  	
  1197  DATA masks<>+0xa0(SB)/4, $0xffffffff
  1198  DATA masks<>+0xa4(SB)/4, $0xffffffff
  1199  DATA masks<>+0xa8(SB)/4, $0x0000ffff
  1200  DATA masks<>+0xac(SB)/4, $0x00000000
  1201  	
  1202  DATA masks<>+0xb0(SB)/4, $0xffffffff
  1203  DATA masks<>+0xb4(SB)/4, $0xffffffff
  1204  DATA masks<>+0xb8(SB)/4, $0x00ffffff
  1205  DATA masks<>+0xbc(SB)/4, $0x00000000
  1206  	
  1207  DATA masks<>+0xc0(SB)/4, $0xffffffff
  1208  DATA masks<>+0xc4(SB)/4, $0xffffffff
  1209  DATA masks<>+0xc8(SB)/4, $0xffffffff
  1210  DATA masks<>+0xcc(SB)/4, $0x00000000
  1211  	
  1212  DATA masks<>+0xd0(SB)/4, $0xffffffff
  1213  DATA masks<>+0xd4(SB)/4, $0xffffffff
  1214  DATA masks<>+0xd8(SB)/4, $0xffffffff
  1215  DATA masks<>+0xdc(SB)/4, $0x000000ff
  1216  	
  1217  DATA masks<>+0xe0(SB)/4, $0xffffffff
  1218  DATA masks<>+0xe4(SB)/4, $0xffffffff
  1219  DATA masks<>+0xe8(SB)/4, $0xffffffff
  1220  DATA masks<>+0xec(SB)/4, $0x0000ffff
  1221  	
  1222  DATA masks<>+0xf0(SB)/4, $0xffffffff
  1223  DATA masks<>+0xf4(SB)/4, $0xffffffff
  1224  DATA masks<>+0xf8(SB)/4, $0xffffffff
  1225  DATA masks<>+0xfc(SB)/4, $0x00ffffff
  1226  
  1227  GLOBL masks<>(SB),RODATA,$256
  1228  
  1229  // these are arguments to pshufb.  They move data down from
  1230  // the high bytes of the register to the low bytes of the register.
  1231  // index is how many bytes to move.
        // Sixteen 16-byte entries: entry n (at byte offset 16*n) selects source
        // bytes 16-n..15 into destination bytes 0..n-1. The 0xff selectors have
        // their high bit set, which makes PSHUFB write a zero byte. Entry 0 is
        // all-zero selectors; aeshashbody never uses it because a zero length is
        // handled before the table lookup.
  1232  DATA shifts<>+0x00(SB)/4, $0x00000000
  1233  DATA shifts<>+0x04(SB)/4, $0x00000000
  1234  DATA shifts<>+0x08(SB)/4, $0x00000000
  1235  DATA shifts<>+0x0c(SB)/4, $0x00000000
  1236  	
  1237  DATA shifts<>+0x10(SB)/4, $0xffffff0f
  1238  DATA shifts<>+0x14(SB)/4, $0xffffffff
  1239  DATA shifts<>+0x18(SB)/4, $0xffffffff
  1240  DATA shifts<>+0x1c(SB)/4, $0xffffffff
  1241  	
  1242  DATA shifts<>+0x20(SB)/4, $0xffff0f0e
  1243  DATA shifts<>+0x24(SB)/4, $0xffffffff
  1244  DATA shifts<>+0x28(SB)/4, $0xffffffff
  1245  DATA shifts<>+0x2c(SB)/4, $0xffffffff
  1246  	
  1247  DATA shifts<>+0x30(SB)/4, $0xff0f0e0d
  1248  DATA shifts<>+0x34(SB)/4, $0xffffffff
  1249  DATA shifts<>+0x38(SB)/4, $0xffffffff
  1250  DATA shifts<>+0x3c(SB)/4, $0xffffffff
  1251  	
  1252  DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c
  1253  DATA shifts<>+0x44(SB)/4, $0xffffffff
  1254  DATA shifts<>+0x48(SB)/4, $0xffffffff
  1255  DATA shifts<>+0x4c(SB)/4, $0xffffffff
  1256  	
  1257  DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b
  1258  DATA shifts<>+0x54(SB)/4, $0xffffff0f
  1259  DATA shifts<>+0x58(SB)/4, $0xffffffff
  1260  DATA shifts<>+0x5c(SB)/4, $0xffffffff
  1261  	
  1262  DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a
  1263  DATA shifts<>+0x64(SB)/4, $0xffff0f0e
  1264  DATA shifts<>+0x68(SB)/4, $0xffffffff
  1265  DATA shifts<>+0x6c(SB)/4, $0xffffffff
  1266  	
  1267  DATA shifts<>+0x70(SB)/4, $0x0c0b0a09
  1268  DATA shifts<>+0x74(SB)/4, $0xff0f0e0d
  1269  DATA shifts<>+0x78(SB)/4, $0xffffffff
  1270  DATA shifts<>+0x7c(SB)/4, $0xffffffff
  1271  	
  1272  DATA shifts<>+0x80(SB)/4, $0x0b0a0908
  1273  DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c
  1274  DATA shifts<>+0x88(SB)/4, $0xffffffff
  1275  DATA shifts<>+0x8c(SB)/4, $0xffffffff
  1276  	
  1277  DATA shifts<>+0x90(SB)/4, $0x0a090807
  1278  DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b
  1279  DATA shifts<>+0x98(SB)/4, $0xffffff0f
  1280  DATA shifts<>+0x9c(SB)/4, $0xffffffff
  1281  	
  1282  DATA shifts<>+0xa0(SB)/4, $0x09080706
  1283  DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a
  1284  DATA shifts<>+0xa8(SB)/4, $0xffff0f0e
  1285  DATA shifts<>+0xac(SB)/4, $0xffffffff
  1286  	
  1287  DATA shifts<>+0xb0(SB)/4, $0x08070605
  1288  DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09
  1289  DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d
  1290  DATA shifts<>+0xbc(SB)/4, $0xffffffff
  1291  	
  1292  DATA shifts<>+0xc0(SB)/4, $0x07060504
  1293  DATA shifts<>+0xc4(SB)/4, $0x0b0a0908
  1294  DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c
  1295  DATA shifts<>+0xcc(SB)/4, $0xffffffff
  1296  	
  1297  DATA shifts<>+0xd0(SB)/4, $0x06050403
  1298  DATA shifts<>+0xd4(SB)/4, $0x0a090807
  1299  DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b
  1300  DATA shifts<>+0xdc(SB)/4, $0xffffff0f
  1301  	
  1302  DATA shifts<>+0xe0(SB)/4, $0x05040302
  1303  DATA shifts<>+0xe4(SB)/4, $0x09080706
  1304  DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a
  1305  DATA shifts<>+0xec(SB)/4, $0xffff0f0e
  1306  	
  1307  DATA shifts<>+0xf0(SB)/4, $0x04030201
  1308  DATA shifts<>+0xf4(SB)/4, $0x08070605
  1309  DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09
  1310  DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d
  1311  
  1312  GLOBL shifts<>(SB),RODATA,$256
  1313  
  1314  TEXT runtime·memeq(SB),NOSPLIT,$0-13
        	// Loads memeqbody's register arguments from the frame (a, b, size and
        	// the address of the one-byte result) and tail-jumps to it.
  1315  	MOVL	a+0(FP), SI
  1316  	MOVL	b+4(FP), DI
  1317  	MOVL	size+8(FP), BX
  1318  	LEAL	ret+12(FP), AX
  1319  	JMP	runtime·memeqbody(SB)
  1320  
  1321  // memequal_varlen(a, b unsafe.Pointer) bool
        // Reports whether the two regions are equal; the byte count is not an
        // argument but is read from the closure DX points at.
  1322  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
  1323  	MOVL	b+4(FP), DI
  1324  	MOVL	a+0(FP), SI
  1325  	CMPL	SI, DI
  1326  	JNE	compare
        	// identical pointers: equal without looking at the bytes
  1327  	MOVB	$1, ret+8(FP)
  1328  	RET
  1329  compare:
  1330  	LEAL	ret+8(FP), AX	// memeqbody writes its result through AX
  1331  	MOVL	4(DX), BX	// compiler stores size at offset 4 in the closure
  1332  	JMP	runtime·memeqbody(SB)
  1333  
  1334  // eqstring reports whether two strings hold the same bytes.
  1335  // Only s1's length is consulted: the compiler guarantees
  1336  // that both strings passed here have equal length.
  1337  // See runtime_test.go:eqstring_generic for
  1338  // equivalent Go code.
  1339  TEXT runtime·eqstring(SB),NOSPLIT,$0-17
  1340  	MOVL	s2str+8(FP), DI
  1341  	MOVL	s1str+0(FP), SI
  1342  	CMPL	SI, DI
  1343  	JNE	compare
        	// same data pointer (and, by contract, same length): equal
  1344  	MOVB	$1, v+16(FP)
  1345  	RET
  1346  compare:
  1347  	LEAL	v+16(FP), AX	// memeqbody writes its result through AX
  1348  	MOVL	s1len+4(FP), BX
  1349  	JMP	runtime·memeqbody(SB)
  1350  
  1351  TEXT bytes·Equal(SB),NOSPLIT,$0-25
        	// Reports whether slices a and b have the same length and contents.
        	// A length mismatch answers false immediately; otherwise the byte
        	// comparison is delegated to memeqbody.
  1352  	MOVL	b_len+16(FP), CX
  1353  	MOVL	a_len+4(FP), BX
  1354  	CMPL	BX, CX
  1355  	JEQ	samelen
  1356  	MOVB	$0, ret+24(FP)
  1357  	RET
  1358  samelen:
  1359  	LEAL	ret+24(FP), AX	// memeqbody writes its result through AX
  1360  	MOVL	b+12(FP), DI
  1361  	MOVL	a+0(FP), SI
  1362  	JMP	runtime·memeqbody(SB)
  1363  
  1364  // a in SI
  1365  // b in DI
  1366  // count in BX
  1367  // address of result byte in AX
        //
        // Writes 1 through AX if the BX bytes at SI and DI are equal, else 0.
        // Reached by JMP from memeq, memequal_varlen, eqstring and bytes·Equal,
        // so its RET returns to their caller.
  1368  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1369  	CMPL	BX, $4
  1370  	JB	small
  1371  
  1372  	// 64 bytes at a time using xmm registers
  1373  hugeloop:
  1374  	CMPL	BX, $64
  1375  	JB	bigloop
  1376  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
  1377  	JE	bigloop
  1378  	MOVOU	(SI), X0
  1379  	MOVOU	(DI), X1
  1380  	MOVOU	16(SI), X2
  1381  	MOVOU	16(DI), X3
  1382  	MOVOU	32(SI), X4
  1383  	MOVOU	32(DI), X5
  1384  	MOVOU	48(SI), X6
  1385  	MOVOU	48(DI), X7
  1386  	PCMPEQB	X1, X0
  1387  	PCMPEQB	X3, X2
  1388  	PCMPEQB	X5, X4
  1389  	PCMPEQB	X7, X6
  1390  	PAND	X2, X0
  1391  	PAND	X6, X4
  1392  	PAND	X4, X0
  1393  	PMOVMSKB X0, DX	// DX = 0xffff only if all 64 byte pairs matched
  1394  	ADDL	$64, SI
  1395  	ADDL	$64, DI
  1396  	SUBL	$64, BX
  1397  	CMPL	DX, $0xffff
  1398  	JEQ	hugeloop
  1399  	MOVB	$0, (AX)
  1400  	RET
  1401  
  1402  	// 4 bytes at a time using 32-bit register
  1403  bigloop:
  1404  	CMPL	BX, $4
  1405  	JBE	leftover
  1406  	MOVL	(SI), CX
  1407  	MOVL	(DI), DX
  1408  	ADDL	$4, SI
  1409  	ADDL	$4, DI
  1410  	SUBL	$4, BX
  1411  	CMPL	CX, DX
  1412  	JEQ	bigloop
  1413  	MOVB	$0, (AX)
  1414  	RET
  1415  
  1416  	// remaining 0-4 bytes
        	// Compare the 4 bytes that end at the end of each buffer. When fewer
        	// than 4 bytes remain this re-reads bytes that already compared equal,
        	// which is safe because this path is only reached when count >= 4.
  1417  leftover:
  1418  	MOVL	-4(SI)(BX*1), CX
  1419  	MOVL	-4(DI)(BX*1), DX
  1420  	CMPL	CX, DX
  1421  	SETEQ	(AX)
  1422  	RET
  1423  
  1424  small:
  1425  	CMPL	BX, $0
  1426  	JEQ	equal	// ZF is set here, so the SETEQ at equal stores 1
  1427  
        	// CX = -8*count. As a 32-bit shift count only its low 5 bits are
        	// used, i.e. 32-8*count, the number of bits that are not wanted.
  1428  	LEAL	0(BX*8), CX
  1429  	NEGL	CX
  1430  
  1431  	MOVL	SI, DX
  1432  	CMPB	DX, $0xfc	// low address byte above 0xfc: a 4-byte load would cross into the next 256-byte block
  1433  	JA	si_high
  1434  
  1435  	// load at SI won't cross a page boundary.
  1436  	MOVL	(SI), SI
  1437  	JMP	si_finish
  1438  si_high:
  1439  	// address ends in 111111xx.  Load up to bytes we want, move to correct position.
  1440  	MOVL	-4(SI)(BX*1), SI
  1441  	SHRL	CX, SI
  1442  si_finish:
  1443  
  1444  	// same for DI.
  1445  	MOVL	DI, DX
  1446  	CMPB	DX, $0xfc
  1447  	JA	di_high
  1448  	MOVL	(DI), DI
  1449  	JMP	di_finish
  1450  di_high:
  1451  	MOVL	-4(DI)(BX*1), DI
  1452  	SHRL	CX, DI
  1453  di_finish:
  1454  
        	// The wanted bytes are now the low count bytes of SI and DI. Shifting
        	// the difference left by 32-8*count discards the unwanted high bytes
        	// and leaves ZF set exactly when the wanted bytes are equal.
  1455  	SUBL	SI, DI
  1456  	SHLL	CX, DI
  1457  equal:
  1458  	SETEQ	(AX)
  1459  	RET
  1460  
  1461  TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
        	// Loads cmpbody's register arguments from the two string headers and
        	// tail-jumps to it; cmpbody stores the 1/0/-1 result through AX.
  1462  	MOVL	s1_base+0(FP), SI
  1463  	MOVL	s1_len+4(FP), BX
  1464  	MOVL	s2_base+8(FP), DI
  1465  	MOVL	s2_len+12(FP), DX
  1466  	LEAL	ret+16(FP), AX
  1467  	JMP	runtime·cmpbody(SB)
  1468  
  1469  TEXT bytes·Compare(SB),NOSPLIT,$0-28
        	// Loads cmpbody's register arguments from the two slice headers (data
        	// pointer at offset 0, length at offset 4 of each 12-byte header) and
        	// tail-jumps to it; cmpbody stores the 1/0/-1 result through AX.
  1470  	MOVL	s1+0(FP), SI	// s1 data pointer
  1471  	MOVL	s1+4(FP), BX	// s1 length
  1472  	MOVL	s2+12(FP), DI	// s2 data pointer
  1473  	MOVL	s2+16(FP), DX	// s2 length
  1474  	LEAL	ret+24(FP), AX
  1475  	JMP	runtime·cmpbody(SB)
  1476  
  1477  TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
        	// Returns the index of the first byte in slice s equal to c, or -1.
        	// REPN SCASB scans forward from DI for at most CX bytes and stops one
        	// byte past a match with ZF set. With CX = 0 nothing is scanned and
        	// the flags are whatever they were on entry, but then DI still equals
        	// SI, so the found path also produces 0 - 1 = -1.
  1478  	MOVL	s_len+4(FP), CX
  1479  	MOVB	c+12(FP), AL
  1480  	MOVL	s+0(FP), DI
  1481  	MOVL	DI, SI	// SI keeps the start of the slice
  1482  	CLD; REPN; SCASB
  1483  	JNE	notfound
  1484  	SUBL	SI, DI	// DI = offset one past the match
  1485  	DECL	DI
  1486  	MOVL	DI, ret+16(FP)
  1487  	RET
        notfound:
  1488  	MOVL	$-1, ret+16(FP)
  1489  	RET
  1490  
  1491  TEXT strings·IndexByte(SB),NOSPLIT,$0-16
        	// Returns the index of the first byte in string s equal to c, or -1.
        	// REPN SCASB scans forward from DI for at most CX bytes and stops one
        	// byte past a match with ZF set. With CX = 0 nothing is scanned and
        	// the flags are whatever they were on entry, but then DI still equals
        	// SI, so the found path also produces 0 - 1 = -1.
  1492  	MOVL	s_len+4(FP), CX
  1493  	MOVB	c+8(FP), AL
  1494  	MOVL	s+0(FP), DI
  1495  	MOVL	DI, SI	// SI keeps the start of the string
  1496  	CLD; REPN; SCASB
  1497  	JNE	notfound
  1498  	SUBL	SI, DI	// DI = offset one past the match
  1499  	DECL	DI
  1500  	MOVL	DI, ret+12(FP)
  1501  	RET
        notfound:
  1502  	MOVL	$-1, ret+12(FP)
  1503  	RET
  1504  
  1505  // input:
  1506  //   SI = a
  1507  //   DI = b
  1508  //   BX = alen
  1509  //   DX = blen
  1510  //   AX = address of return word (set to 1/0/-1)
        //
        // Three-way comparison of a and b. The first differing byte within the
        // common prefix decides, compared as unsigned; if the common prefix is
        // identical the lengths decide. Reached by JMP from cmpstring and
        // bytes·Compare, so its RET returns to their caller.
  1511  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1512  	MOVL	DX, BP
  1513  	SUBL	BX, DX // DX = blen-alen
  1514  	CMOVLGT	BX, BP // BP = min(alen, blen)
  1515  	CMPL	SI, DI
  1516  	JEQ	allsame	// same start address: only the lengths can differ
  1517  	CMPL	BP, $4
  1518  	JB	small
  1519  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
  1520  	JE	mediumloop
  1521  largeloop:
  1522  	CMPL	BP, $16
  1523  	JB	mediumloop
  1524  	MOVOU	(SI), X0
  1525  	MOVOU	(DI), X1
  1526  	PCMPEQB X0, X1
  1527  	PMOVMSKB X1, BX	// one bit per byte, set where the bytes are equal
  1528  	XORL	$0xffff, BX	// convert EQ to NE
  1529  	JNE	diff16	// branch if at least one byte is not equal
  1530  	ADDL	$16, SI
  1531  	ADDL	$16, DI
  1532  	SUBL	$16, BP
  1533  	JMP	largeloop
  1534  
  1535  diff16:
  1536  	BSFL	BX, BX	// index of first byte that differs
  1537  	XORL	DX, DX	// the length difference is no longer needed; DX becomes the result
  1538  	MOVB	(SI)(BX*1), CX
  1539  	CMPB	CX, (DI)(BX*1)
  1540  	SETHI	DX	// 1 if a's byte is greater (unsigned)
  1541  	LEAL	-1(DX*2), DX	// convert 1/0 to +1/-1
  1542  	MOVL	DX, (AX)
  1543  	RET
  1544  
  1545  mediumloop:
  1546  	CMPL	BP, $4
  1547  	JBE	_0through4
  1548  	MOVL	(SI), BX
  1549  	MOVL	(DI), CX
  1550  	CMPL	BX, CX
  1551  	JNE	diff4
  1552  	ADDL	$4, SI
  1553  	ADDL	$4, DI
  1554  	SUBL	$4, BP
  1555  	JMP	mediumloop
  1556  
        	// Compare the last 4 bytes of the common prefix. When fewer than 4
        	// remain this re-reads bytes that already compared equal.
  1557  _0through4:
  1558  	MOVL	-4(SI)(BP*1), BX
  1559  	MOVL	-4(DI)(BP*1), CX
  1560  	CMPL	BX, CX
  1561  	JEQ	allsame
  1562  
        	// BX and CX hold 4 bytes of a and b that differ somewhere. After the
        	// byte swap the first byte in memory is the most significant, so the
        	// highest differing bit belongs to the first differing byte, and a's
        	// value of that bit says whether a is the larger.
  1563  diff4:
  1564  	BSWAPL	BX	// reverse order of bytes
  1565  	BSWAPL	CX
  1566  	XORL	BX, CX	// find bit differences
  1567  	BSRL	CX, CX	// index of highest bit difference
  1568  	SHRL	CX, BX	// move a's bit to bottom
  1569  	ANDL	$1, BX	// mask bit
  1570  	LEAL	-1(BX*2), BX // 1/0 => +1/-1
  1571  	MOVL	BX, (AX)
  1572  	RET
  1573  
  1574  	// 0-3 bytes in common
  1575  small:
        	// CX = -8*BP; as a 32-bit shift count that is 32-8*BP. NEGL also
        	// sets ZF when BP is 0, i.e. when there are no bytes to compare.
  1576  	LEAL	(BP*8), CX
  1577  	NEGL	CX
  1578  	JEQ	allsame
  1579  
  1580  	// load si
  1581  	CMPB	SI, $0xfc	// low address byte above 0xfc: a 4-byte load would cross into the next 256-byte block
  1582  	JA	si_high
  1583  	MOVL	(SI), SI
  1584  	JMP	si_finish
  1585  si_high:
  1586  	MOVL	-4(SI)(BP*1), SI	// load ending at the last wanted byte instead
  1587  	SHRL	CX, SI	// wanted bytes down to the bottom of the register
  1588  si_finish:
  1589  	SHLL	CX, SI	// wanted bytes to the top, zeros below
  1590  
  1591  	// same for di
  1592  	CMPB	DI, $0xfc
  1593  	JA	di_high
  1594  	MOVL	(DI), DI
  1595  	JMP	di_finish
  1596  di_high:
  1597  	MOVL	-4(DI)(BP*1), DI
  1598  	SHRL	CX, DI
  1599  di_finish:
  1600  	SHLL	CX, DI
  1601  
  1602  	BSWAPL	SI	// reverse order of bytes
  1603  	BSWAPL	DI
  1604  	XORL	SI, DI	// find bit differences
  1605  	JEQ	allsame
  1606  	BSRL	DI, CX	// index of highest bit difference
  1607  	SHRL	CX, SI	// move a's bit to bottom
  1608  	ANDL	$1, SI	// mask bit
  1609  	LEAL	-1(SI*2), BX // 1/0 => +1/-1
  1610  	MOVL	BX, (AX)
  1611  	RET
  1612  
  1613  	// all the bytes in common are the same, so we just need
  1614  	// to compare the lengths.
  1615  allsame:
  1616  	XORL	BX, BX
  1617  	XORL	CX, CX
  1618  	TESTL	DX, DX	// DX = blen-alen, computed on entry
  1619  	SETLT	BX	// 1 if alen > blen
  1620  	SETEQ	CX	// 1 if alen == blen
  1621  	LEAL	-1(CX)(BX*2), BX	// 1,0,-1 result
  1622  	MOVL	BX, (AX)
  1623  	RET
  1624  
  1625  TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
        	// Advances the current m's fastrand state by one step and returns the
        	// new state: x = 2*x, then x ^= 0x88888eef if bit 31 of 2*x is set.
  1626  	get_tls(CX)
  1627  	MOVL	g(CX), AX
  1628  	MOVL	g_m(AX), AX
  1629  	MOVL	m_fastrand(AX), DX
  1630  	ADDL	DX, DX
  1631  	MOVL	DX, BX	// BX = 2*x, kept in case the XOR has to be undone
  1632  	XORL	$0x88888eef, DX
        	// The flags tested here come from the XORL. The constant has its top
        	// bit set, so the XOR result is negative exactly when 2*x had a clear
        	// top bit; in that case restore the un-XORed value.
  1633  	CMOVLMI	BX, DX
  1634  	MOVL	DX, m_fastrand(AX)
  1635  	MOVL	DX, ret+0(FP)
  1636  	RET
  1637  
  1638  TEXT runtime·return0(SB), NOSPLIT, $0
        	// Returns with AX set to 0. The value is left in the register, not
        	// stored to a frame slot.
  1639  	MOVL	$0, AX
  1640  	RET
  1641  
  1642  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  1643  // Must obey the gcc calling convention.
  1644  TEXT _cgo_topofstack(SB),NOSPLIT,$0
        	// Only AX and CX are written; the result is returned in AX.
  1645  	get_tls(CX)
  1646  	MOVL	g(CX), AX
  1647  	MOVL	g_m(AX), AX
  1648  	MOVL	m_curg(AX), AX
  1649  	MOVL	(g_stack+stack_hi)(AX), AX
  1650  	RET
  1651  
  1652  // The top-most function running on a goroutine
  1653  // returns to goexit+PCQuantum.
        // The leading one-byte NOP is what makes goexit+1 land on the CALL below.
        // There is deliberately no RET: goexit1 does not return.
  1654  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  1655  	BYTE	$0x90	// NOP
  1656  	CALL	runtime·goexit1(SB)	// does not return
  1657  	// traceback from goexit1 must hit code range of goexit
  1658  	BYTE	$0x90	// NOP
  1659  
  1660  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4
        	// Issues a PREFETCHT0 hint for the address passed in addr; no result.
  1661  	MOVL	addr+0(FP), AX
  1662  	PREFETCHT0	(AX)
  1663  	RET
  1664  
  1665  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4
        	// Issues a PREFETCHT1 hint for the address passed in addr; no result.
  1666  	MOVL	addr+0(FP), AX
  1667  	PREFETCHT1	(AX)
  1668  	RET
  1669  
  1670  
  1671  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4
        	// Issues a PREFETCHT2 hint for the address passed in addr; no result.
  1672  	MOVL	addr+0(FP), AX
  1673  	PREFETCHT2	(AX)
  1674  	RET
  1675  
  1676  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4
        	// Issues a PREFETCHNTA hint for the address passed in addr; no result.
  1677  	MOVL	addr+0(FP), AX
  1678  	PREFETCHNTA	(AX)
  1679  	RET