github.com/c0deoo1/golang1.5@v0.0.0-20220525150107-c87c805d4593/src/runtime/asm_386.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
     10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
         	// Runtime bootstrap. In order: stash argc/argv, give g0 provisional
         	// stack bounds, probe CPUID, set up TLS (through _cgo_init when it is
         	// present, otherwise ldt0setup), link g0 and m0 together, run
         	// check/args/osinit/schedinit, queue runtime·main via newproc and
         	// finally call mstart. The INT $3 after mstart traps if it returns.
     11  	// copy arguments forward on an even stack
     12  	MOVL	argc+0(FP), AX
     13  	MOVL	argv+4(FP), BX
     14  	SUBL	$128, SP		// plenty of scratch
     15  	ANDL	$~15, SP		// round SP down to a 16-byte boundary
     16  	MOVL	AX, 120(SP)		// save argc, argv away
     17  	MOVL	BX, 124(SP)
     18  
     19  	// set default stack bounds.
     20  	// _cgo_init may update stackguard.
     21  	MOVL	$runtime·g0(SB), BP
     22  	LEAL	(-64*1024+104)(SP), BX	// BX = SP - 64KB + 104, used as the provisional low bound
     23  	MOVL	BX, g_stackguard0(BP)
     24  	MOVL	BX, g_stackguard1(BP)
     25  	MOVL	BX, (g_stack+stack_lo)(BP)
     26  	MOVL	SP, (g_stack+stack_hi)(BP)
     27  	
     28  	// find out information about the processor we're on
     29  	MOVL	$0, AX			// CPUID leaf 0: AX = highest leaf, BX/DX/CX = vendor string
     30  	CPUID
     31  	CMPL	AX, $0
     32  	JE	nocpuinfo		// no leaf 1 available: leave cpuid_ecx/cpuid_edx untouched
     33  
     34  	// Figure out how to serialize RDTSC.
     35  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
     36  	// Don't know about the rest, so let's do MFENCE.
     37  	CMPL	BX, $0x756E6547  // "Genu"
     38  	JNE	notintel
     39  	CMPL	DX, $0x49656E69  // "ineI"
     40  	JNE	notintel
     41  	CMPL	CX, $0x6C65746E  // "ntel"
     42  	JNE	notintel
     43  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)	// consulted by cputicks
     44  notintel:
     45  
     46  	MOVL	$1, AX			// CPUID leaf 1: feature bits come back in CX and DX
     47  	CPUID
     48  	MOVL	CX, runtime·cpuid_ecx(SB)
     49  	MOVL	DX, runtime·cpuid_edx(SB)
     50  nocpuinfo:	
     51  
     52  	// if there is an _cgo_init, call it to let it
     53  	// initialize and to set up GS.  if not,
     54  	// we set up GS ourselves.
     55  	MOVL	_cgo_init(SB), AX
     56  	TESTL	AX, AX
     57  	JZ	needtls
     58  	MOVL	$setg_gcc<>(SB), BX
     59  	MOVL	BX, 4(SP)		// second argument: address of setg_gcc
     60  	MOVL	BP, 0(SP)		// first argument: &g0 (BP was loaded above)
     61  	CALL	AX
     62  
     63  	// update stackguard after _cgo_init
     64  	MOVL	$runtime·g0(SB), CX
     65  	MOVL	(g_stack+stack_lo)(CX), AX
     66  	ADDL	$const__StackGuard, AX	// guard = stack.lo + _StackGuard
     67  	MOVL	AX, g_stackguard0(CX)
     68  	MOVL	AX, g_stackguard1(CX)
     69  
     70  	// skip runtime·ldt0setup(SB) and tls test after _cgo_init for non-windows
     71  	CMPL runtime·iswindows(SB), $0
     72  	JEQ ok
     73  needtls:
     74  	// skip runtime·ldt0setup(SB) and tls test on Plan 9 in all cases
     75  	CMPL	runtime·isplan9(SB), $1
     76  	JEQ	ok
     77  
     78  	// set up %gs
     79  	CALL	runtime·ldt0setup(SB)
     80  
     81  	// store through it, to make sure it works
     82  	get_tls(BX)
     83  	MOVL	$0x123, g(BX)		// write a marker through the TLS slot...
     84  	MOVL	runtime·tls0(SB), AX	// ...and read it back directly from tls0
     85  	CMPL	AX, $0x123
     86  	JEQ	ok
     87  	MOVL	AX, 0	// abort
     88  ok:
     89  	// set up m and g "registers"
     90  	get_tls(BX)
     91  	LEAL	runtime·g0(SB), CX
     92  	MOVL	CX, g(BX)
     93  	LEAL	runtime·m0(SB), AX
     94  
     95  	// save m->g0 = g0
     96  	MOVL	CX, m_g0(AX)
     97  	// save g0->m = m0
     98  	MOVL	AX, g_m(CX)
     99  
    100  	CALL	runtime·emptyfunc(SB)	// fault if stack check is wrong
    101  
    102  	// convention is D is always cleared
    103  	CLD
    104  
    105  	CALL	runtime·check(SB)
    106  
    107  	// saved argc, argv
    108  	MOVL	120(SP), AX
    109  	MOVL	AX, 0(SP)
    110  	MOVL	124(SP), AX
    111  	MOVL	AX, 4(SP)
    112  	CALL	runtime·args(SB)
    113  	CALL	runtime·osinit(SB)
    114  	CALL	runtime·schedinit(SB)
    115  
    116  	// create a new goroutine to start program
    117  	PUSHL	$runtime·mainPC(SB)	// entry
    118  	PUSHL	$0	// arg size
    119  	CALL	runtime·newproc(SB)
    120  	POPL	AX			// discard the two words pushed for newproc
    121  	POPL	AX
    122  
    123  	// start this M
    124  	CALL	runtime·mstart(SB)
    125  
    126  	INT $3				// reached only if mstart returns
    127  	RET
   128  
    129  DATA	runtime·mainPC+0(SB)/4,$runtime·main(SB)	// one 4-byte word holding the address of runtime·main
    130  GLOBL	runtime·mainPC(SB),RODATA,$4	// read-only; rt0_go pushes its address as newproc's entry argument
   131  
    132  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
    133  	INT $3		// raise a breakpoint trap
    134  	RET		// resume in the caller if the trap is continued
   135  
    136  TEXT runtime·asminit(SB),NOSPLIT,$0-0
    137  	// Linux and MinGW start the FPU in extended double precision.
    138  	// Other operating systems use double precision.
    139  	// Change to double precision to match them,
    140  	// and to match other hardware that only has double.
    141  	PUSHL $0x27F	// control word: precision-control field = double, all x87 exceptions masked
    142  	FLDCW	0(SP)	// load the x87 control word from the stack slot just pushed
    143  	POPL AX		// drop the temporary; the value in AX is not used
    144  	RET
   145  
   146  /*
   147   *  go-routine
   148   */
   149  
    150  // void gosave(Gobuf*)
    151  // save state in Gobuf; setjmp
         // Records the caller's SP, the return PC and the current g in *buf,
         // and zeroes buf.ret and buf.ctxt.
    152  TEXT runtime·gosave(SB), NOSPLIT, $0-4
    153  	MOVL	buf+0(FP), AX		// gobuf
    154  	LEAL	buf+0(FP), BX		// caller's SP
    155  	MOVL	BX, gobuf_sp(AX)
    156  	MOVL	0(SP), BX		// caller's PC
    157  	MOVL	BX, gobuf_pc(AX)
    158  	MOVL	$0, gobuf_ret(AX)
    159  	MOVL	$0, gobuf_ctxt(AX)
    160  	get_tls(CX)
    161  	MOVL	g(CX), BX		// BX = current g
    162  	MOVL	BX, gobuf_g(AX)
    163  	RET
   164  
    165  // void gogo(Gobuf*)
    166  // restore state from Gobuf; longjmp
         // Makes buf.g the current g, switches SP to buf.sp, loads buf.ret into AX
         // and buf.ctxt into DX, clears those three fields, and jumps to buf.pc.
         // Does not return to its caller.
    167  TEXT runtime·gogo(SB), NOSPLIT, $0-4
    168  	MOVL	buf+0(FP), BX		// gobuf
    169  	MOVL	gobuf_g(BX), DX
    170  	MOVL	0(DX), CX		// make sure g != nil
    171  	get_tls(CX)			// CX is reloaded here; the load above only probes the pointer
    172  	MOVL	DX, g(CX)
    173  	MOVL	gobuf_sp(BX), SP	// restore SP
    174  	MOVL	gobuf_ret(BX), AX
    175  	MOVL	gobuf_ctxt(BX), DX
    176  	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
    177  	MOVL	$0, gobuf_ret(BX)
    178  	MOVL	$0, gobuf_ctxt(BX)
    179  	MOVL	gobuf_pc(BX), BX	// BX is overwritten last, once the gobuf is no longer needed
    180  	JMP	BX
   181  
    182  // func mcall(fn func(*g))
    183  // Switch to m->g0's stack, call fn(g).
    184  // Fn must never return.  It should gogo(&g->sched)
    185  // to keep running g.
         // Calling mcall while already on m->g0 jumps to badmcall; if fn does
         // return, control jumps to badmcall2.
    186  TEXT runtime·mcall(SB), NOSPLIT, $0-4
    187  	MOVL	fn+0(FP), DI		// DI = fn (func value)
    188  	
    189  	get_tls(CX)
    190  	MOVL	g(CX), AX	// save state in g->sched
    191  	MOVL	0(SP), BX	// caller's PC
    192  	MOVL	BX, (g_sched+gobuf_pc)(AX)
    193  	LEAL	fn+0(FP), BX	// caller's SP
    194  	MOVL	BX, (g_sched+gobuf_sp)(AX)
    195  	MOVL	AX, (g_sched+gobuf_g)(AX)
    196  
    197  	// switch to m->g0 & its stack, call fn
    198  	MOVL	g(CX), BX
    199  	MOVL	g_m(BX), BX
    200  	MOVL	m_g0(BX), SI
    201  	CMPL	SI, AX	// if g == m->g0 call badmcall
    202  	JNE	3(PC)		// different g: skip the two-instruction badmcall jump
    203  	MOVL	$runtime·badmcall(SB), AX
    204  	JMP	AX
    205  	MOVL	SI, g(CX)	// g = m->g0
    206  	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
    207  	PUSHL	AX		// argument for fn: the g that called mcall
    208  	MOVL	DI, DX		// DX = func value, available to fn as its context
    209  	MOVL	0(DI), DI	// first word of the func value is the code pointer
    210  	CALL	DI
    211  	POPL	AX		// only reached if fn returned
    212  	MOVL	$runtime·badmcall2(SB), AX
    213  	JMP	AX
    214  	RET			// not reached: follows an unconditional jump
   215  
    216  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
    217  // of the G stack.  We need to distinguish the routine that
    218  // lives at the bottom of the G stack from the one that lives
    219  // at the top of the system stack because the one at the top of
    220  // the system stack terminates the stack walk (see topofstack()).
         // Only its address is used: systemstack stores it in g->sched.pc.
    221  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
    222  	RET
   223  
    224  // func systemstack(fn func())
         // Runs fn on the g0 stack. When the current g is already g0 or gsignal,
         // fn is called directly. When it is m->curg, the goroutine's state is
         // saved in g->sched, fn runs on g0's stack, and execution then switches
         // back to m->curg. Any other g leads to badsystemstack.
    225  TEXT runtime·systemstack(SB), NOSPLIT, $0-4
    226  	MOVL	fn+0(FP), DI	// DI = fn
    227  	get_tls(CX)
    228  	MOVL	g(CX), AX	// AX = g
    229  	MOVL	g_m(AX), BX	// BX = m
    230  
    231  	MOVL	m_gsignal(BX), DX	// DX = gsignal
    232  	CMPL	AX, DX
    233  	JEQ	noswitch
    234  
    235  	MOVL	m_g0(BX), DX	// DX = g0
    236  	CMPL	AX, DX
    237  	JEQ	noswitch
    238  
    239  	MOVL	m_curg(BX), BP
    240  	CMPL	AX, BP
    241  	JEQ	switch
    242  	
    243  	// Bad: g is not gsignal, not g0, not curg. What is it?
    244  	// Hide call from linker nosplit analysis.
    245  	MOVL	$runtime·badsystemstack(SB), AX
    246  	CALL	AX		// NOTE(review): presumably never returns; otherwise this falls into switch
    247  
    248  switch:
    249  	// save our state in g->sched.  Pretend to
    250  	// be systemstack_switch if the G stack is scanned.
    251  	MOVL	$runtime·systemstack_switch(SB), (g_sched+gobuf_pc)(AX)
    252  	MOVL	SP, (g_sched+gobuf_sp)(AX)
    253  	MOVL	AX, (g_sched+gobuf_g)(AX)
    254  
    255  	// switch to g0
    256  	MOVL	DX, g(CX)	// DX still holds g0 from the comparison above
    257  	MOVL	(g_sched+gobuf_sp)(DX), BX
    258  	// make it look like mstart called systemstack on g0, to stop traceback
    259  	SUBL	$4, BX
    260  	MOVL	$runtime·mstart(SB), DX
    261  	MOVL	DX, 0(BX)	// fake return address slot
    262  	MOVL	BX, SP
    263  
    264  	// call target function
    265  	MOVL	DI, DX		// DX = func value, available to fn as its context
    266  	MOVL	0(DI), DI	// first word of the func value is the code pointer
    267  	CALL	DI
    268  
    269  	// switch back to g
    270  	get_tls(CX)
    271  	MOVL	g(CX), AX
    272  	MOVL	g_m(AX), BX
    273  	MOVL	m_curg(BX), AX
    274  	MOVL	AX, g(CX)
    275  	MOVL	(g_sched+gobuf_sp)(AX), SP
    276  	MOVL	$0, (g_sched+gobuf_sp)(AX)	// clear the saved SP once it has been restored
    277  	RET
    278  
    279  noswitch:
    280  	// already on system stack, just call directly
    281  	MOVL	DI, DX
    282  	MOVL	0(DI), DI
    283  	CALL	DI
    284  	RET
   285  
   286  /*
   287   * support for morestack
   288   */
   289  
    290  // Called during function prolog when more stack is needed.
    291  //
    292  // The traceback routines see morestack on a g0 as being
    293  // the top of a stack (for example, morestack calling newstack
    294  // calling the scheduler calling newm calling gc), so we must
    295  // record an argument size. For that purpose, it has no arguments.
         //
         // On entry 0(SP) is the return PC into f (the function whose prologue
         // called morestack), 4(SP) is f's return PC into its own caller, and DX
         // is f's context value, which is saved in g->sched.ctxt.
    296  TEXT runtime·morestack(SB),NOSPLIT,$0-0
    297  	// Cannot grow scheduler stack (m->g0).
    298  	get_tls(CX)
    299  	MOVL	g(CX), BX
    300  	MOVL	g_m(BX), BX	// BX = m for the rest of the function
    301  	MOVL	m_g0(BX), SI
    302  	CMPL	g(CX), SI
    303  	JNE	2(PC)
    304  	INT	$3		// trap: running on m->g0
    305  
    306  	// Cannot grow signal stack.
    307  	MOVL	m_gsignal(BX), SI
    308  	CMPL	g(CX), SI
    309  	JNE	2(PC)
    310  	INT	$3		// trap: running on m->gsignal
    311  
    312  	// Called from f.
    313  	// Set m->morebuf to f's caller.
    314  	MOVL	4(SP), DI	// f's caller's PC
    315  	MOVL	DI, (m_morebuf+gobuf_pc)(BX)
    316  	LEAL	8(SP), CX	// f's caller's SP
    317  	MOVL	CX, (m_morebuf+gobuf_sp)(BX)
    318  	get_tls(CX)		// CX was used as scratch above; reload the TLS base
    319  	MOVL	g(CX), SI
    320  	MOVL	SI, (m_morebuf+gobuf_g)(BX)
    321  
    322  	// Set g->sched to context in f.
    323  	MOVL	0(SP), AX	// f's PC
    324  	MOVL	AX, (g_sched+gobuf_pc)(SI)
    325  	MOVL	SI, (g_sched+gobuf_g)(SI)
    326  	LEAL	4(SP), AX	// f's SP
    327  	MOVL	AX, (g_sched+gobuf_sp)(SI)
    328  	MOVL	DX, (g_sched+gobuf_ctxt)(SI)
    329  
    330  	// Call newstack on m->g0's stack.
    331  	MOVL	m_g0(BX), BP
    332  	MOVL	BP, g(CX)
    333  	MOVL	(g_sched+gobuf_sp)(BP), AX
    334  	MOVL	-4(AX), BX	// fault if CALL would, before smashing SP
    335  	MOVL	AX, SP
    336  	CALL	runtime·newstack(SB)
    337  	MOVL	$0, 0x1003	// crash if newstack returns
    338  	RET
   339  
    340  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0-0
         	// Variant of morestack for callers with no context value: zero DX so
         	// morestack records a nil ctxt, then tail-jump to it.
    341  	MOVL	$0, DX
    342  	JMP runtime·morestack(SB)
   343  
    344  TEXT runtime·stackBarrier(SB),NOSPLIT,$0
    345  	// We came here via a RET to an overwritten return PC.
    346  	// AX may be live. Other registers are available.
         	// AX is never written here; only CX, DX and BX are used as scratch.
    347  
    348  	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
    349  	get_tls(CX)
    350  	MOVL	g(CX), CX			// CX = g
    351  	MOVL	(g_stkbar+slice_array)(CX), DX	// DX = base of the g.stkbar array
    352  	MOVL	g_stkbarPos(CX), BX
    353  	IMULL	$stkbar__size, BX	// Too big for SIB.
    354  	MOVL	stkbar_savedLRVal(DX)(BX*1), BX	// BX = saved return PC of the current barrier
    355  	// Record that this stack barrier was hit.
    356  	ADDL	$1, g_stkbarPos(CX)
    357  	// Jump to the original return PC.
    358  	JMP	BX
   359  
    360  // reflectcall: call a function with the given argument list
    361  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
    362  // we don't have variable-sized frames, so we use a small number
    363  // of constant-sized-frame functions to encode a few bits of size in the pc.
    364  // Caution: ugly multiline assembly macros in your future!
    365  
         // DISPATCH(NAME, MAXSIZE): with the argument size in CX, tail-jump to NAME
         // when CX <= MAXSIZE (unsigned compare); otherwise skip the two-instruction
         // jump and fall through to whatever follows the macro. Clobbers AX.
    366  #define DISPATCH(NAME,MAXSIZE)		\
    367  	CMPL	CX, $MAXSIZE;		\
    368  	JA	3(PC);			\
    369  	MOVL	$NAME(SB), AX;		\
    370  	JMP	AX
    371  // Note: can't just "JMP NAME(SB)" - bad inlining results.
   372  
    373  TEXT reflect·call(SB), NOSPLIT, $0-0
         	// Entry point under the reflect package's symbol name; tail-jumps to
         	// ·reflectcall with the caller's argument frame untouched.
    374  	JMP	·reflectcall(SB)
   375  
    376  TEXT ·reflectcall(SB), NOSPLIT, $0-20
         	// Loads argsize into CX and tail-jumps to the first callN stub whose
         	// frame size N is >= argsize. The DISPATCH entries are tried in
         	// ascending order; sizes above 1073741824 end up in badreflectcall.
    377  	MOVL	argsize+12(FP), CX
    378  	DISPATCH(runtime·call16, 16)
    379  	DISPATCH(runtime·call32, 32)
    380  	DISPATCH(runtime·call64, 64)
    381  	DISPATCH(runtime·call128, 128)
    382  	DISPATCH(runtime·call256, 256)
    383  	DISPATCH(runtime·call512, 512)
    384  	DISPATCH(runtime·call1024, 1024)
    385  	DISPATCH(runtime·call2048, 2048)
    386  	DISPATCH(runtime·call4096, 4096)
    387  	DISPATCH(runtime·call8192, 8192)
    388  	DISPATCH(runtime·call16384, 16384)
    389  	DISPATCH(runtime·call32768, 32768)
    390  	DISPATCH(runtime·call65536, 65536)
    391  	DISPATCH(runtime·call131072, 131072)
    392  	DISPATCH(runtime·call262144, 262144)
    393  	DISPATCH(runtime·call524288, 524288)
    394  	DISPATCH(runtime·call1048576, 1048576)
    395  	DISPATCH(runtime·call2097152, 2097152)
    396  	DISPATCH(runtime·call4194304, 4194304)
    397  	DISPATCH(runtime·call8388608, 8388608)
    398  	DISPATCH(runtime·call16777216, 16777216)
    399  	DISPATCH(runtime·call33554432, 33554432)
    400  	DISPATCH(runtime·call67108864, 67108864)
    401  	DISPATCH(runtime·call134217728, 134217728)
    402  	DISPATCH(runtime·call268435456, 268435456)
    403  	DISPATCH(runtime·call536870912, 536870912)
    404  	DISPATCH(runtime·call1073741824, 1073741824)
         	// No stub is large enough for this argument size.
    405  	MOVL	$runtime·badreflectcall(SB), AX
    406  	JMP	AX
   407  
    408  #define CALLFN(NAME,MAXSIZE)			\
         	/* Defines NAME as a WRAPPER function with a MAXSIZE-byte frame. */	\
         	/* It copies argsize bytes from argptr to the bottom of its frame, */	\
         	/* calls the func value f, copies the bytes from retoffset up to */	\
         	/* argsize back to argptr, then calls callwritebarrier with */	\
         	/* (argtype, argptr, argsize, retoffset). */	\
    409  TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
    410  	NO_LOCAL_POINTERS;			\
    411  	/* copy arguments to stack */		\
    412  	MOVL	argptr+8(FP), SI;		\
    413  	MOVL	argsize+12(FP), CX;		\
    414  	MOVL	SP, DI;				\
    415  	REP;MOVSB;				\
    416  	/* call function */			\
    417  	MOVL	f+4(FP), DX;			\
    418  	MOVL	(DX), AX; 			\
    419  	PCDATA  $PCDATA_StackMapIndex, $0;	\
    420  	CALL	AX;				\
    421  	/* copy return values back */		\
    422  	MOVL	argptr+8(FP), DI;		\
    423  	MOVL	argsize+12(FP), CX;		\
    424  	MOVL	retoffset+16(FP), BX;		\
    425  	MOVL	SP, SI;				\
    426  	ADDL	BX, DI;				\
    427  	ADDL	BX, SI;				\
    428  	SUBL	BX, CX;				\
    429  	REP;MOVSB;				\
    430  	/* execute write barrier updates */	\
    431  	MOVL	argtype+0(FP), DX;		\
    432  	MOVL	argptr+8(FP), DI;		\
    433  	MOVL	argsize+12(FP), CX;		\
    434  	MOVL	retoffset+16(FP), BX;		\
    435  	MOVL	DX, 0(SP);			\
    436  	MOVL	DI, 4(SP);			\
    437  	MOVL	CX, 8(SP);			\
    438  	MOVL	BX, 12(SP);			\
    439  	CALL	runtime·callwritebarrier(SB);	\
    440  	RET
   441  
    442  CALLFN(·call16, 16)
         // One fixed-frame stub per power-of-two size from 16 bytes to 1073741824
         // bytes; the sizes match the DISPATCH list in ·reflectcall one for one.
         // NOTE(review): DISPATCH spells these runtime·callN while they are defined
         // here as ·callN - presumably the same symbols inside package runtime.
    443  CALLFN(·call32, 32)
    444  CALLFN(·call64, 64)
    445  CALLFN(·call128, 128)
    446  CALLFN(·call256, 256)
    447  CALLFN(·call512, 512)
    448  CALLFN(·call1024, 1024)
    449  CALLFN(·call2048, 2048)
    450  CALLFN(·call4096, 4096)
    451  CALLFN(·call8192, 8192)
    452  CALLFN(·call16384, 16384)
    453  CALLFN(·call32768, 32768)
    454  CALLFN(·call65536, 65536)
    455  CALLFN(·call131072, 131072)
    456  CALLFN(·call262144, 262144)
    457  CALLFN(·call524288, 524288)
    458  CALLFN(·call1048576, 1048576)
    459  CALLFN(·call2097152, 2097152)
    460  CALLFN(·call4194304, 4194304)
    461  CALLFN(·call8388608, 8388608)
    462  CALLFN(·call16777216, 16777216)
    463  CALLFN(·call33554432, 33554432)
    464  CALLFN(·call67108864, 67108864)
    465  CALLFN(·call134217728, 134217728)
    466  CALLFN(·call268435456, 268435456)
    467  CALLFN(·call536870912, 536870912)
    468  CALLFN(·call1073741824, 1073741824)
   469  
    470  // bool cas(int32 *val, int32 old, int32 new)
    471  // Atomically:
    472  //	if(*val == old){
    473  //		*val = new;
    474  //		return 1;
    475  //	}else
    476  //		return 0;
    477  TEXT runtime·cas(SB), NOSPLIT, $0-13
    478  	MOVL	ptr+0(FP), BX
    479  	MOVL	old+4(FP), AX		// CMPXCHG compares the memory operand against AX
    480  	MOVL	new+8(FP), CX
    481  	LOCK
    482  	CMPXCHGL	CX, 0(BX)	// if *ptr == AX then *ptr = CX and ZF is set
    483  	SETEQ	ret+12(FP)		// one-byte result: 1 when the swap happened
    484  	RET
   485  
    486  TEXT runtime·casuintptr(SB), NOSPLIT, $0-13
         	// Same frame layout as cas ($0-13), so simply tail-jump to it.
    487  	JMP	runtime·cas(SB)
   488  
    489  TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-8
         	// Tail-jumps to atomicload, which is defined outside this section.
    490  	JMP	runtime·atomicload(SB)
   491  
    492  TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-8
         	// Tail-jumps to atomicload, which is defined outside this section.
    493  	JMP	runtime·atomicload(SB)
   494  
    495  TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-8
         	// Same frame layout as atomicstore ($0-8), so simply tail-jump to it.
    496  	JMP	runtime·atomicstore(SB)
   497  
    498  // bool runtime·cas64(uint64 *val, uint64 old, uint64 new)
    499  // Atomically:
    500  //	if(*val == old){
    501  //		*val = new;
    502  //		return 1;
    503  //	} else {
    504  //		return 0;
    505  //	}
    506  TEXT runtime·cas64(SB), NOSPLIT, $0-21
    507  	MOVL	ptr+0(FP), BP
    508  	MOVL	old_lo+4(FP), AX	// CMPXCHG8B compares the memory operand against DX:AX...
    509  	MOVL	old_hi+8(FP), DX
    510  	MOVL	new_lo+12(FP), BX	// ...and on a match stores CX:BX
    511  	MOVL	new_hi+16(FP), CX
    512  	LOCK
    513  	CMPXCHG8B	0(BP)
    514  	SETEQ	ret+20(FP)		// one-byte result: 1 when the swap happened
    515  	RET
   516  
    517  // bool casp(void **p, void *old, void *new)
    518  // Atomically:
    519  //	if(*p == old){
    520  //		*p = new;
    521  //		return 1;
    522  //	}else
    523  //		return 0;
         // The instruction sequence is the same as runtime·cas.
    524  TEXT runtime·casp1(SB), NOSPLIT, $0-13
    525  	MOVL	ptr+0(FP), BX
    526  	MOVL	old+4(FP), AX		// CMPXCHG compares the memory operand against AX
    527  	MOVL	new+8(FP), CX
    528  	LOCK
    529  	CMPXCHGL	CX, 0(BX)	// if *ptr == AX then *ptr = CX and ZF is set
    530  	SETEQ	ret+12(FP)		// one-byte result: 1 when the swap happened
    531  	RET
   532  
   533  // uint32 xadd(uint32 volatile *val, int32 delta)
   534  // Atomically:
   535  //	*val += delta;
   536  //	return *val;
   537  TEXT runtime·xadd(SB), NOSPLIT, $0-12
   538  	MOVL	ptr+0(FP), BX
   539  	MOVL	delta+4(FP), AX
   540  	MOVL	AX, CX
   541  	LOCK
   542  	XADDL	AX, 0(BX)
   543  	ADDL	CX, AX
   544  	MOVL	AX, ret+8(FP)
   545  	RET
   546  
   547  TEXT runtime·xchg(SB), NOSPLIT, $0-12
   548  	MOVL	ptr+0(FP), BX
   549  	MOVL	new+4(FP), AX
   550  	XCHGL	AX, 0(BX)
   551  	MOVL	AX, ret+8(FP)
   552  	RET
   553  
   554  TEXT runtime·xchgp1(SB), NOSPLIT, $0-12
   555  	MOVL	ptr+0(FP), BX
   556  	MOVL	new+4(FP), AX
   557  	XCHGL	AX, 0(BX)
   558  	MOVL	AX, ret+8(FP)
   559  	RET
   560  
    561  TEXT runtime·xchguintptr(SB), NOSPLIT, $0-12
         	// Same frame layout as xchg ($0-12), so simply tail-jump to it.
    562  	JMP	runtime·xchg(SB)
   563  
    564  TEXT runtime·procyield(SB),NOSPLIT,$0-0
         	// Executes PAUSE `cycles` times as a spin-wait delay.
         	// NOTE(review): the frame is declared $0-0 although a 4-byte argument
         	// is read from cycles+0(FP).
         	// NOTE(review): cycles == 0 would wrap to 0xFFFFFFFF on the first SUBL
         	// and spin about 2^32 times - confirm that no caller passes 0.
    565  	MOVL	cycles+0(FP), AX
    566  again:
    567  	PAUSE
    568  	SUBL	$1, AX
    569  	JNZ	again		// loop until the counter reaches zero
    570  	RET
   571  
   572  TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-8
   573  	MOVL	ptr+0(FP), BX
   574  	MOVL	val+4(FP), AX
   575  	XCHGL	AX, 0(BX)
   576  	RET
   577  
   578  TEXT runtime·atomicstore(SB), NOSPLIT, $0-8
   579  	MOVL	ptr+0(FP), BX
   580  	MOVL	val+4(FP), AX
   581  	XCHGL	AX, 0(BX)
   582  	RET
   583  
    584  // uint64 atomicload64(uint64 volatile* addr);
         // Loads 8 bytes from *addr with a single MMX MOVQ and stores them in the
         // result slot. Faults deliberately when addr is not 8-byte aligned.
    585  TEXT runtime·atomicload64(SB), NOSPLIT, $0-12
    586  	MOVL	ptr+0(FP), AX
    587  	TESTL	$7, AX
    588  	JZ	2(PC)			// aligned: skip the deliberate fault
    589  	MOVL	0, AX // crash with nil ptr deref
    590  	LEAL	ret_lo+4(FP), BX	// BX = address of the 8-byte result slot
    591  	// MOVQ (%EAX), %MM0
    592  	BYTE $0x0f; BYTE $0x6f; BYTE $0x00
    593  	// MOVQ %MM0, 0(%EBX)
    594  	BYTE $0x0f; BYTE $0x7f; BYTE $0x03
    595  	// EMMS
    596  	BYTE $0x0F; BYTE $0x77
    597  	RET
   598  
    599  // void runtime·atomicstore64(uint64 volatile* addr, uint64 v);
         // Stores the 8-byte v to *addr with a single MMX MOVQ, then executes a
         // LOCKed no-op as a fence. Faults deliberately when addr is not 8-byte
         // aligned.
    600  TEXT runtime·atomicstore64(SB), NOSPLIT, $0-12
    601  	MOVL	ptr+0(FP), AX
    602  	TESTL	$7, AX
    603  	JZ	2(PC)			// aligned: skip the deliberate fault
    604  	MOVL	0, AX // crash with nil ptr deref
    605  	// MOVQ and EMMS were introduced on the Pentium MMX.
    606  	// MOVQ 0x8(%ESP), %MM0
         	// (with a zero-size frame, 0(SP) is the return PC and 4(SP) is ptr,
         	// so 8(SP) is the start of the 64-bit value argument)
    607  	BYTE $0x0f; BYTE $0x6f; BYTE $0x44; BYTE $0x24; BYTE $0x08
    608  	// MOVQ %MM0, (%EAX)
    609  	BYTE $0x0f; BYTE $0x7f; BYTE $0x00 
    610  	// EMMS
    611  	BYTE $0x0F; BYTE $0x77
    612  	// This is essentially a no-op, but it provides required memory fencing.
    613  	// It can be replaced with MFENCE, but MFENCE was introduced only on the Pentium4 (SSE2).
    614  	MOVL	$0, AX
    615  	LOCK
    616  	XADDL	AX, (SP)		// adds zero to the word at the top of the stack
    617  	RET
   618  
   619  // void	runtime·atomicor8(byte volatile*, byte);
   620  TEXT runtime·atomicor8(SB), NOSPLIT, $0-5
   621  	MOVL	ptr+0(FP), AX
   622  	MOVB	val+4(FP), BX
   623  	LOCK
   624  	ORB	BX, (AX)
   625  	RET
   626  
   627  // void	runtime·atomicand8(byte volatile*, byte);
   628  TEXT runtime·atomicand8(SB), NOSPLIT, $0-5
   629  	MOVL	ptr+0(FP), AX
   630  	MOVB	val+4(FP), BX
   631  	LOCK
   632  	ANDB	BX, (AX)
   633  	RET
   634  
    635  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
    636  	// Stores are already ordered on x86, so this is just a
    637  	// compile barrier.
         	// No instruction is emitted besides the return.
    638  	RET
   639  
    640  // void jmpdefer(fn, sp);
    641  // called from deferreturn.
    642  // 1. pop the caller
    643  // 2. sub 5 bytes from the callers return
    644  // 3. jmp to the argument
         // Net effect: the deferred function fn runs with the stack positioned as
         // if deferreturn's caller had just executed its CALL, and the return
         // address it sees has been moved back by 5 bytes so that the CALL runs
         // again when fn returns.
    645  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
    646  	MOVL	fv+0(FP), DX	// fn
    647  	MOVL	argp+4(FP), BX	// caller sp
    648  	LEAL	-4(BX), SP	// caller sp after CALL
    649  	SUBL	$5, (SP)	// return to CALL again
    650  	MOVL	0(DX), BX	// first word of the func value is the code pointer
    651  	JMP	BX	// but first run the deferred function
   652  
    653  // Save state of caller into g->sched.
         // Stores the caller's SP and the caller's return PC into g->sched and
         // zeroes sched.ret and sched.ctxt. AX and BX are preserved via push/pop.
    654  TEXT gosave<>(SB),NOSPLIT,$0
    655  	PUSHL	AX
    656  	PUSHL	BX
    657  	get_tls(BX)
    658  	MOVL	g(BX), BX		// BX = current g
    659  	LEAL	arg+0(FP), AX		// AX = SP as seen by our caller
    660  	MOVL	AX, (g_sched+gobuf_sp)(BX)
    661  	MOVL	-4(AX), AX		// the word just below it is our return address
    662  	MOVL	AX, (g_sched+gobuf_pc)(BX)
    663  	MOVL	$0, (g_sched+gobuf_ret)(BX)
    664  	MOVL	$0, (g_sched+gobuf_ctxt)(BX)
    665  	POPL	BX
    666  	POPL	AX
    667  	RET
   668  
    669  // func asmcgocall(fn, arg unsafe.Pointer) int32
    670  // Call fn(arg) on the scheduler stack,
    671  // aligned appropriately for the gcc ABI.
    672  // See cgocall.go for more details.
    673  TEXT ·asmcgocall(SB),NOSPLIT,$0-12
    674  	MOVL	fn+0(FP), AX
    675  	MOVL	arg+4(FP), BX
    676  
    677  	MOVL	SP, DX			// DX = SP on the calling g's stack
    678  
    679  	// Figure out if we need to switch to m->g0 stack.
    680  	// We get called to create new OS threads too, and those
    681  	// come in on the m->g0 stack already.
    682  	get_tls(CX)
    683  	MOVL	g(CX), BP
    684  	MOVL	g_m(BP), BP
    685  	MOVL	m_g0(BP), SI
    686  	MOVL	g(CX), DI		// DI = g we were called on
    687  	CMPL	SI, DI
    688  	JEQ	4(PC)			// already on g0: skip the three-instruction switch
    689  	CALL	gosave<>(SB)
    690  	MOVL	SI, g(CX)
    691  	MOVL	(g_sched+gobuf_sp)(SI), SP
    692  
    693  	// Now on a scheduling stack (a pthread-created stack).
    694  	SUBL	$32, SP
    695  	ANDL	$~15, SP	// alignment, perhaps unnecessary
    696  	MOVL	DI, 8(SP)	// save g
    697  	MOVL	(g_stack+stack_hi)(DI), DI
    698  	SUBL	DX, DI			// DI = g.stack.hi - original SP
    699  	MOVL	DI, 4(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
    700  	MOVL	BX, 0(SP)	// first argument in x86-32 ABI
    701  	CALL	AX
    702  
    703  	// Restore registers, g, stack pointer.
    704  	get_tls(CX)
    705  	MOVL	8(SP), DI		// DI = saved g
    706  	MOVL	(g_stack+stack_hi)(DI), SI
    707  	SUBL	4(SP), SI		// SI = g.stack.hi - saved depth = original SP
    708  	MOVL	DI, g(CX)
    709  	MOVL	SI, SP
    710  
    711  	MOVL	AX, ret+8(FP)		// AX holds whatever fn returned
    712  	RET
   713  
    714  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
    715  // Turn the fn into a Go func (by taking its address) and call
    716  // cgocallback_gofunc.
    717  TEXT runtime·cgocallback(SB),NOSPLIT,$12-12
    718  	LEAL	fn+0(FP), AX		// address of the fn argument slot, passed as the func value
    719  	MOVL	AX, 0(SP)
    720  	MOVL	frame+4(FP), AX
    721  	MOVL	AX, 4(SP)
    722  	MOVL	framesize+8(FP), AX
    723  	MOVL	AX, 8(SP)
    724  	MOVL	$runtime·cgocallback_gofunc(SB), AX
    725  	CALL	AX			// indirect call through AX
    726  	RET
   727  
    728  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
    729  // See cgocall.go for more details.
         // Register roles in the body: BP = m (until the switch to curg), DX = the
         // m that was current on entry (0 when needm had to supply one), SI = the
         // g being switched to.
    730  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-12
    731  	NO_LOCAL_POINTERS
    732  
    733  	// If g is nil, Go did not create the current thread.
    734  	// Call needm to obtain one for temporary use.
    735  	// In this case, we're running on the thread stack, so there's
    736  	// lots of space, but the linker doesn't know. Hide the call from
    737  	// the linker analysis by using an indirect call through AX.
    738  	get_tls(CX)
    739  #ifdef GOOS_windows
    740  	MOVL	$0, BP
    741  	CMPL	CX, $0
    742  	JEQ	2(PC) // TODO
    743  #endif
    744  	MOVL	g(CX), BP		// skipped on windows when the TLS base is 0, leaving BP = 0
    745  	CMPL	BP, $0
    746  	JEQ	needm
    747  	MOVL	g_m(BP), BP
    748  	MOVL	BP, DX // saved copy of oldm
    749  	JMP	havem
    750  needm:
    751  	MOVL	$0, 0(SP)
    752  	MOVL	$runtime·needm(SB), AX
    753  	CALL	AX
    754  	MOVL	0(SP), DX		// reload the slot zeroed above into DX (the saved oldm)
    755  	get_tls(CX)
    756  	MOVL	g(CX), BP
    757  	MOVL	g_m(BP), BP
    758  
    759  	// Set m->sched.sp = SP, so that if a panic happens
    760  	// during the function we are about to execute, it will
    761  	// have a valid SP to run on the g0 stack.
    762  	// The next few lines (after the havem label)
    763  	// will save this SP onto the stack and then write
    764  	// the same SP back to m->sched.sp. That seems redundant,
    765  	// but if an unrecovered panic happens, unwindm will
    766  	// restore the g->sched.sp from the stack location
    767  	// and then systemstack will try to use it. If we don't set it here,
    768  	// that restored SP will be uninitialized (typically 0) and
    769  	// will not be usable.
    770  	MOVL	m_g0(BP), SI
    771  	MOVL	SP, (g_sched+gobuf_sp)(SI)
    772  
    773  havem:
    774  	// Now there's a valid m, and we're running on its m->g0.
    775  	// Save current m->g0->sched.sp on stack and then set it to SP.
    776  	// Save current sp in m->g0->sched.sp in preparation for
    777  	// switch back to m->curg stack.
    778  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
    779  	MOVL	m_g0(BP), SI
    780  	MOVL	(g_sched+gobuf_sp)(SI), AX
    781  	MOVL	AX, 0(SP)
    782  	MOVL	SP, (g_sched+gobuf_sp)(SI)
    783  
    784  	// Switch to m->curg stack and call runtime.cgocallbackg.
    785  	// Because we are taking over the execution of m->curg
    786  	// but *not* resuming what had been running, we need to
    787  	// save that information (m->curg->sched) so we can restore it.
    788  	// We can restore m->curg->sched.sp easily, because calling
    789  	// runtime.cgocallbackg leaves SP unchanged upon return.
    790  	// To save m->curg->sched.pc, we push it onto the stack.
    791  	// This has the added benefit that it looks to the traceback
    792  	// routine like cgocallbackg is going to return to that
    793  	// PC (because the frame we allocate below has the same
    794  	// size as cgocallback_gofunc's frame declared above)
    795  	// so that the traceback will seamlessly trace back into
    796  	// the earlier calls.
    797  	//
    798  	// In the new goroutine, 0(SP) holds the saved oldm (DX) register.
    799  	// 4(SP) and 8(SP) are unused.
    800  	MOVL	m_curg(BP), SI
    801  	MOVL	SI, g(CX)
    802  	MOVL	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
    803  	MOVL	(g_sched+gobuf_pc)(SI), BP
    804  	MOVL	BP, -4(DI)		// saved sched.pc sits where a return address would
    805  	LEAL	-(4+12)(DI), SP		// 4 bytes for that PC plus a 12-byte frame
    806  	MOVL	DX, 0(SP)
    807  	CALL	runtime·cgocallbackg(SB)
    808  	MOVL	0(SP), DX		// reload oldm after the call
    809  
    810  	// Restore g->sched (== m->curg->sched) from saved values.
    811  	get_tls(CX)
    812  	MOVL	g(CX), SI
    813  	MOVL	12(SP), BP		// the PC stored at -4(DI) above
    814  	MOVL	BP, (g_sched+gobuf_pc)(SI)
    815  	LEAL	(12+4)(SP), DI		// the original sched.sp
    816  	MOVL	DI, (g_sched+gobuf_sp)(SI)
    817  
    818  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
    819  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
    820  	// so we do not have to restore it.)
    821  	MOVL	g(CX), BP
    822  	MOVL	g_m(BP), BP
    823  	MOVL	m_g0(BP), SI
    824  	MOVL	SI, g(CX)
    825  	MOVL	(g_sched+gobuf_sp)(SI), SP
    826  	MOVL	0(SP), AX		// the old g0 sched.sp saved at havem
    827  	MOVL	AX, (g_sched+gobuf_sp)(SI)
    828  	
    829  	// If the m on entry was nil, we called needm above to borrow an m
    830  	// for the duration of the call. Since the call is over, return it with dropm.
    831  	CMPL	DX, $0
    832  	JNE 3(PC)			// had our own m: skip the two-instruction dropm call
    833  	MOVL	$runtime·dropm(SB), AX
    834  	CALL	AX
    835  
    836  	// Done!
    837  	RET
   838  
    839  // void setg(G*); set g. for use by needm.
         // On windows the word at 0x14(FS) is maintained as well: it is cleared
         // when gg is nil (and the function returns immediately), otherwise it
         // is pointed at gg->m->tls before g is stored.
    840  TEXT runtime·setg(SB), NOSPLIT, $0-4
    841  	MOVL	gg+0(FP), BX
    842  #ifdef GOOS_windows
    843  	CMPL	BX, $0
    844  	JNE	settls
    845  	MOVL	$0, 0x14(FS)
    846  	RET
    847  settls:
    848  	MOVL	g_m(BX), AX
    849  	LEAL	m_tls(AX), AX		// AX = &gg->m->tls
    850  	MOVL	AX, 0x14(FS)
    851  #endif
    852  	get_tls(CX)
    853  	MOVL	BX, g(CX)		// store gg in the TLS g slot
    854  	RET
   855  
    856  // void setg_gcc(G*); set g. for use by gcc
         // rt0_go passes this function's address to _cgo_init.
         // Uses only AX and DX.
    857  TEXT setg_gcc<>(SB), NOSPLIT, $0
    858  	get_tls(AX)
    859  	MOVL	gg+0(FP), DX
    860  	MOVL	DX, g(AX)		// store gg in the TLS g slot
    861  	RET
   862  
    863  // check that SP is strictly between g->stack.lo and g->stack.hi
         // (both comparisons are strict); traps with INT $3 otherwise.
    864  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
    865  	get_tls(CX)
    866  	MOVL	g(CX), AX
    867  	CMPL	(g_stack+stack_hi)(AX), SP
    868  	JHI	2(PC)			// ok when stack.hi > SP (unsigned)
    869  	INT	$3
    870  	CMPL	SP, (g_stack+stack_lo)(AX)
    871  	JHI	2(PC)			// ok when SP > stack.lo (unsigned)
    872  	INT	$3
    873  	RET
   874  
    875  TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8
         	// Returns the return PC stored just below argp. If that slot holds the
         	// stack-barrier PC, the value produced by nextBarrierPC is returned
         	// instead.
    876  	MOVL	argp+0(FP),AX		// addr of first arg
    877  	MOVL	-4(AX),AX		// get calling pc
    878  	CMPL	AX, runtime·stackBarrierPC(SB)
    879  	JNE	nobar
    880  	// Get original return PC.
    881  	CALL	runtime·nextBarrierPC(SB)
    882  	MOVL	0(SP), AX		// nextBarrierPC's result is read from 0(SP)
    883  nobar:
    884  	MOVL	AX, ret+4(FP)
    885  	RET
   886  
    887  TEXT runtime·setcallerpc(SB),NOSPLIT,$4-8
         	// Overwrites the return PC stored just below argp with pc. If that slot
         	// currently holds the stack-barrier PC it is left alone and pc is
         	// handed to setNextBarrierPC instead.
    888  	MOVL	argp+0(FP),AX		// addr of first arg
    889  	MOVL	pc+4(FP), BX
    890  	MOVL	-4(AX), CX		// CX = current return PC in that slot
    891  	CMPL	CX, runtime·stackBarrierPC(SB)
    892  	JEQ	setbar
    893  	MOVL	BX, -4(AX)		// set calling pc
    894  	RET
    895  setbar:
    896  	// Set the stack barrier return PC.
    897  	MOVL	BX, 0(SP)		// pc is the single argument
    898  	CALL	runtime·setNextBarrierPC(SB)
    899  	RET
   900  
   901  TEXT runtime·getcallersp(SB), NOSPLIT, $0-8
   902  	MOVL	argp+0(FP), AX
   903  	MOVL	AX, ret+4(FP)
   904  	RET
   905  
    906  // func cputicks() int64
         // Returns the 64-bit RDTSC counter. When the CPUID feature word has bit 26
         // (0x4000000) set, a fence is issued first: LFENCE if rt0_go set
         // lfenceBeforeRdtsc, otherwise MFENCE.
    907  TEXT runtime·cputicks(SB),NOSPLIT,$0-8
    908  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // no sse2, no mfence
    909  	JEQ	done			// bit clear: read the counter without a fence
    910  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
    911  	JNE	mfence
    912  	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
    913  	JMP	done
    914  mfence:
    915  	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
    916  done:
    917  	RDTSC				// counter returned in DX:AX
    918  	MOVL	AX, ret_lo+0(FP)
    919  	MOVL	DX, ret_hi+4(FP)
    920  	RET
   921  
    922  TEXT runtime·ldt0setup(SB),NOSPLIT,$16-0
         	// Calls setldt(7, &tls0, 32) to make the TLS segment refer to tls0.
    923  	// set up ldt 7 to point at tls0
    924  	// ldt 1 would be fine on Linux, but on OS X, 7 is as low as we can go.
    925  	// the entry number is just a hint.  setldt will set up GS with what it used.
    926  	MOVL	$7, 0(SP)		// argument 1: requested entry number
    927  	LEAL	runtime·tls0(SB), AX
    928  	MOVL	AX, 4(SP)		// argument 2: base address
    929  	MOVL	$32, 8(SP)	// sizeof(tls array)
    930  	CALL	runtime·setldt(SB)
    931  	RET
   932  
    933  TEXT runtime·emptyfunc(SB),0,$0-0
         	// Does nothing. rt0_go calls it to "fault if stack check is wrong".
         	// NOTE(review): the flags field is 0 rather than NOSPLIT, presumably so
         	// that the toolchain inserts the usual stack-check prologue - confirm.
    934  	RET
   935  
   936  TEXT runtime·abort(SB),NOSPLIT,$0-0
   937  	INT $0x3
   938  
    939  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
    940  // redirects to memhash(p, h, size) using the size
    941  // stored in the closure.
         // The size is read from offset 4 of the context value in DX.
    942  TEXT runtime·memhash_varlen(SB),NOSPLIT,$16-12
    943  	GO_ARGS
    944  	NO_LOCAL_POINTERS
    945  	MOVL	p+0(FP), AX
    946  	MOVL	h+4(FP), BX
    947  	MOVL	4(DX), CX		// CX = size, second word of the closure
    948  	MOVL	AX, 0(SP)		// memhash argument 1: p
    949  	MOVL	BX, 4(SP)		// memhash argument 2: h
    950  	MOVL	CX, 8(SP)		// memhash argument 3: size
    951  	CALL	runtime·memhash(SB)
    952  	MOVL	12(SP), AX		// memhash's result follows its three arguments
    953  	MOVL	AX, ret+8(FP)
    954  	RET
   955  
    956  // hash function using AES hardware instructions
         // Loads the register arguments aeshashbody expects (AX = data pointer,
         // CX = length, DX = address of the result slot) and tail-jumps to it.
         // The seed is not loaded here; aeshashbody reads it from h+4(FP).
    957  TEXT runtime·aeshash(SB),NOSPLIT,$0-16
    958  	MOVL	p+0(FP), AX	// ptr to data
    959  	MOVL	s+8(FP), CX	// size
    960  	LEAL	ret+12(FP), DX
    961  	JMP	runtime·aeshashbody(SB)
   962  
    963  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
         	// String variant of aeshash: p points at a string header, from which
         	// the data pointer and length are loaded before tail-jumping to
         	// aeshashbody with AX = data, CX = length, DX = address of the result.
    964  	MOVL	p+0(FP), AX	// ptr to string object
    965  	MOVL	4(AX), CX	// length of string
    966  	MOVL	(AX), AX	// string data
    967  	LEAL	ret+8(FP), DX
    968  	JMP	runtime·aeshashbody(SB)
   969  
   970  // aeshashbody is the shared tail of aeshash and aeshashstr.
   971  // Register inputs: AX = data, CX = length,
   972  // DX = address to put the 32-bit return value. The seed is read from h+4(FP).
        // NOTE(review): MOVO, PAND, PSHUFB and AESENC are used with memory
        // operands below; presumably masks<>, shifts<> and runtime·aeskeysched
        // are 16-byte aligned by the linker - confirm.
   973  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   974  	MOVO	runtime·aeskeysched(SB), X7	// X7 = first 16 bytes of aeskeysched
   975  	MOVL	h+4(FP), X6	// seed into dword lane 0 of X6
   976  	PINSRD	$2, CX, X6	// length into dword lane 2
   977  	PSHUFHW	$0, X6, X6	// repeat the length's low 16 bits across the four high words
   978  	CMPL	CX, $16
   979  	JB	len0to15
   980  	JEQ	len16
   981  	CMPL	CX, $32
   982  	JBE	len17to32
   983  	CMPL	CX, $64
   984  	JBE	len33to64
   985  	JMP	len65plus
   986
   987  len0to15:
   988  	TESTL	CX, CX
   989  	JEQ	len0
   990
   991  	ADDL	$16, AX	// AX = data+16
   992  	TESTW	$0xff0, AX	// bits 4-11 all zero <=> data lies in the last 16 bytes of a 4096-byte page
   993  	JEQ	pagetail
   994
   995  	// A 16-byte load starting at data stays inside the page,
   996  	// so read it directly and mask off the bytes past the length.
   997  	MOVOU	-16(AX), X0
   998  	ADDL	CX, CX	// with the *8 scale below this selects byte offset 16*length
   999  	PAND	masks<>(SB)(CX*8), X0
  1000
  1001  	// three AES rounds: seed/length first, then X7 twice
  1002  	AESENC	X6, X0
  1003  	AESENC	X7, X0
  1004  	AESENC	X7, X0
  1005  	MOVL	X0, (DX)
  1006  	RET
  1007
  1008  pagetail:
  1009  	// The data starts within 16 bytes of the end of a page, so a load
  1010  	// starting there might touch the next page. Load the 16 bytes that
  1011  	// end at the last data byte, then move them down with PSHUFB.
  1012  	MOVOU	-32(AX)(CX*1), X0
  1013  	ADDL	CX, CX
  1014  	PSHUFB	shifts<>(SB)(CX*8), X0
  1015  	AESENC	X6, X0
  1016  	AESENC	X7, X0
  1017  	AESENC	X7, X0
  1018  	MOVL	X0, (DX)
  1019  	RET
  1020
  1021  len0:
  1022  	// empty input: the hash is the seed itself
  1023  	MOVL	h+4(FP), AX
  1024  	MOVL	AX, (DX)
  1025  	RET
  1026
  1027  len16:
  1028  	MOVOU	(AX), X0
  1029  	AESENC	X6, X0
  1030  	AESENC	X7, X0
  1031  	AESENC	X7, X0
  1032  	MOVL	X0, (DX)
  1033  	RET
  1034
  1035
  1036  len17to32:
  1037  	// two 16-byte loads: the first block and the (possibly overlapping) last block
  1038  	MOVOU	(AX), X0
  1039  	MOVOU	-16(AX)(CX*1), X1
  1040
  1041  	// three rounds per lane; the lanes are independent until the final xor
  1042  	AESENC	X6, X0
  1043  	AESENC	X7, X0
  1044  	AESENC	X7, X0
  1045  	AESENC	runtime·aeskeysched+16(SB), X1
  1046  	AESENC	X7, X1
  1047  	AESENC	X7, X1
  1048
  1049  	// fold the two lanes together
  1050  	PXOR	X1, X0
  1051  	MOVL	X0, (DX)
  1052  	RET
  1053
  1054  len33to64:
  1055  	MOVOU	(AX), X0	// first 32 bytes ...
  1056  	MOVOU	16(AX), X1
  1057  	MOVOU	-32(AX)(CX*1), X2	// ... and last 32 bytes (may overlap the first)
  1058  	MOVOU	-16(AX)(CX*1), X3
  1059
  1060  	AESENC	X6, X0	// round 1: a different key per lane
  1061  	AESENC	runtime·aeskeysched+16(SB), X1
  1062  	AESENC	runtime·aeskeysched+32(SB), X2
  1063  	AESENC	runtime·aeskeysched+48(SB), X3
  1064  	AESENC	X7, X0	// round 2
  1065  	AESENC	X7, X1
  1066  	AESENC	X7, X2
  1067  	AESENC	X7, X3
  1068  	AESENC	X7, X0	// round 3
  1069  	AESENC	X7, X1
  1070  	AESENC	X7, X2
  1071  	AESENC	X7, X3
  1072
  1073  	PXOR	X3, X1	// fold four lanes into X0
  1074  	PXOR	X2, X0
  1075  	PXOR	X1, X0
  1076  	MOVL	X0, (DX)
  1077  	RET
  1078
  1079  len65plus:
  1080  	// seed the state with the final 64 bytes (they may overlap the last loop block)
  1081  	MOVOU	-64(AX)(CX*1), X0
  1082  	MOVOU	-48(AX)(CX*1), X1
  1083  	MOVOU	-32(AX)(CX*1), X2
  1084  	MOVOU	-16(AX)(CX*1), X3
  1085
  1086  	// one round, a different key per lane
  1087  	AESENC	X6, X0
  1088  	AESENC	runtime·aeskeysched+16(SB), X1
  1089  	AESENC	runtime·aeskeysched+32(SB), X2
  1090  	AESENC	runtime·aeskeysched+48(SB), X3
  1091
  1092  	// CX = (length-1)/64 = number of 64-byte blocks to consume from the front
  1093  	DECL	CX
  1094  	SHRL	$6, CX
  1095
  1096  blockloop:
  1097  	// each lane takes one round keyed by 16 bytes of input
  1098  	MOVOU	(AX), X4
  1099  	MOVOU	16(AX), X5
  1100  	AESENC	X4, X0
  1101  	AESENC	X5, X1
  1102  	MOVOU	32(AX), X4
  1103  	MOVOU	48(AX), X5
  1104  	AESENC	X4, X2
  1105  	AESENC	X5, X3
  1106
  1107  	// then one round keyed by X7
  1108  	AESENC	X7, X0
  1109  	AESENC	X7, X1
  1110  	AESENC	X7, X2
  1111  	AESENC	X7, X3
  1112
  1113  	ADDL	$64, AX
  1114  	DECL	CX
  1115  	JNZ	blockloop
  1116
  1117  	// two closing rounds per lane
  1118  	AESENC	X7, X0
  1119  	AESENC	X7, X1
  1120  	AESENC	X7, X2
  1121  	AESENC	X7, X3
  1122  	AESENC	X7, X0
  1123  	AESENC	X7, X1
  1124  	AESENC	X7, X2
  1125  	AESENC	X7, X3
  1126
  1127  	PXOR	X3, X1	// fold four lanes into X0
  1128  	PXOR	X2, X0
  1129  	PXOR	X1, X0
  1130  	MOVL	X0, (DX)
  1131  	RET
  1132  
  1133  TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
        	// Hash the 4 bytes at p, mixing in the seed h.
  1134  	MOVL	h+4(FP), X0	// seed into dword lane 0
  1135  	MOVL	p+0(FP), AX	// ptr to data
  1136  	PINSRD	$1, (AX), X0	// data into dword lane 1
  1137  	AESENC	runtime·aeskeysched+0(SB), X0	// three rounds, each keyed by a different 16 bytes
  1138  	AESENC	runtime·aeskeysched+16(SB), X0
  1139  	AESENC	runtime·aeskeysched+32(SB), X0
  1140  	MOVL	X0, ret+8(FP)	// result is the low 32 bits of the state
  1141  	RET
  1142  
  1143  TEXT runtime·aeshash64(SB),NOSPLIT,$0-12
        	// Hash the 8 bytes at p, mixing in the seed h.
  1144  	MOVL	p+0(FP), AX	// ptr to data
  1145  	MOVQ	(AX), X0	// data into the low 8 bytes
  1146  	PINSRD	$2, h+4(FP), X0	// seed into dword lane 2
  1147  	AESENC	runtime·aeskeysched+0(SB), X0	// three rounds, each keyed by a different 16 bytes
  1148  	AESENC	runtime·aeskeysched+16(SB), X0
  1149  	AESENC	runtime·aeskeysched+32(SB), X0
  1150  	MOVL	X0, ret+8(FP)	// result is the low 32 bits of the state
  1151  	RET
  1152  
  1153  // masks<> is a table of sixteen 16-byte AND masks, one per length 0-15.
        // Entry n starts at byte offset 16*n. Its low n bytes are 0xff and the
        // rest are zero, so ANDing a register with it keeps the low n bytes
        // and clears the data in the high part of the register.
        // entry 0: keeps nothing
  1154  DATA masks<>+0x00(SB)/4, $0x00000000
  1155  DATA masks<>+0x04(SB)/4, $0x00000000
  1156  DATA masks<>+0x08(SB)/4, $0x00000000
  1157  DATA masks<>+0x0c(SB)/4, $0x00000000
  1158  	// entry 1: keeps byte 0
  1159  DATA masks<>+0x10(SB)/4, $0x000000ff
  1160  DATA masks<>+0x14(SB)/4, $0x00000000
  1161  DATA masks<>+0x18(SB)/4, $0x00000000
  1162  DATA masks<>+0x1c(SB)/4, $0x00000000
  1163  	// entry 2: keeps bytes 0-1
  1164  DATA masks<>+0x20(SB)/4, $0x0000ffff
  1165  DATA masks<>+0x24(SB)/4, $0x00000000
  1166  DATA masks<>+0x28(SB)/4, $0x00000000
  1167  DATA masks<>+0x2c(SB)/4, $0x00000000
  1168  	// entry 3: keeps bytes 0-2
  1169  DATA masks<>+0x30(SB)/4, $0x00ffffff
  1170  DATA masks<>+0x34(SB)/4, $0x00000000
  1171  DATA masks<>+0x38(SB)/4, $0x00000000
  1172  DATA masks<>+0x3c(SB)/4, $0x00000000
  1173  	// entry 4: keeps bytes 0-3
  1174  DATA masks<>+0x40(SB)/4, $0xffffffff
  1175  DATA masks<>+0x44(SB)/4, $0x00000000
  1176  DATA masks<>+0x48(SB)/4, $0x00000000
  1177  DATA masks<>+0x4c(SB)/4, $0x00000000
  1178  	// entry 5: keeps bytes 0-4
  1179  DATA masks<>+0x50(SB)/4, $0xffffffff
  1180  DATA masks<>+0x54(SB)/4, $0x000000ff
  1181  DATA masks<>+0x58(SB)/4, $0x00000000
  1182  DATA masks<>+0x5c(SB)/4, $0x00000000
  1183  	// entry 6: keeps bytes 0-5
  1184  DATA masks<>+0x60(SB)/4, $0xffffffff
  1185  DATA masks<>+0x64(SB)/4, $0x0000ffff
  1186  DATA masks<>+0x68(SB)/4, $0x00000000
  1187  DATA masks<>+0x6c(SB)/4, $0x00000000
  1188  	// entry 7: keeps bytes 0-6
  1189  DATA masks<>+0x70(SB)/4, $0xffffffff
  1190  DATA masks<>+0x74(SB)/4, $0x00ffffff
  1191  DATA masks<>+0x78(SB)/4, $0x00000000
  1192  DATA masks<>+0x7c(SB)/4, $0x00000000
  1193  	// entry 8: keeps bytes 0-7
  1194  DATA masks<>+0x80(SB)/4, $0xffffffff
  1195  DATA masks<>+0x84(SB)/4, $0xffffffff
  1196  DATA masks<>+0x88(SB)/4, $0x00000000
  1197  DATA masks<>+0x8c(SB)/4, $0x00000000
  1198  	// entry 9: keeps bytes 0-8
  1199  DATA masks<>+0x90(SB)/4, $0xffffffff
  1200  DATA masks<>+0x94(SB)/4, $0xffffffff
  1201  DATA masks<>+0x98(SB)/4, $0x000000ff
  1202  DATA masks<>+0x9c(SB)/4, $0x00000000
  1203  	// entry 10: keeps bytes 0-9
  1204  DATA masks<>+0xa0(SB)/4, $0xffffffff
  1205  DATA masks<>+0xa4(SB)/4, $0xffffffff
  1206  DATA masks<>+0xa8(SB)/4, $0x0000ffff
  1207  DATA masks<>+0xac(SB)/4, $0x00000000
  1208  	// entry 11: keeps bytes 0-10
  1209  DATA masks<>+0xb0(SB)/4, $0xffffffff
  1210  DATA masks<>+0xb4(SB)/4, $0xffffffff
  1211  DATA masks<>+0xb8(SB)/4, $0x00ffffff
  1212  DATA masks<>+0xbc(SB)/4, $0x00000000
  1213  	// entry 12: keeps bytes 0-11
  1214  DATA masks<>+0xc0(SB)/4, $0xffffffff
  1215  DATA masks<>+0xc4(SB)/4, $0xffffffff
  1216  DATA masks<>+0xc8(SB)/4, $0xffffffff
  1217  DATA masks<>+0xcc(SB)/4, $0x00000000
  1218  	// entry 13: keeps bytes 0-12
  1219  DATA masks<>+0xd0(SB)/4, $0xffffffff
  1220  DATA masks<>+0xd4(SB)/4, $0xffffffff
  1221  DATA masks<>+0xd8(SB)/4, $0xffffffff
  1222  DATA masks<>+0xdc(SB)/4, $0x000000ff
  1223  	// entry 14: keeps bytes 0-13
  1224  DATA masks<>+0xe0(SB)/4, $0xffffffff
  1225  DATA masks<>+0xe4(SB)/4, $0xffffffff
  1226  DATA masks<>+0xe8(SB)/4, $0xffffffff
  1227  DATA masks<>+0xec(SB)/4, $0x0000ffff
  1228  	// entry 15: keeps bytes 0-14
  1229  DATA masks<>+0xf0(SB)/4, $0xffffffff
  1230  DATA masks<>+0xf4(SB)/4, $0xffffffff
  1231  DATA masks<>+0xf8(SB)/4, $0xffffffff
  1232  DATA masks<>+0xfc(SB)/4, $0x00ffffff
  1233
        // 16 entries * 16 bytes, read-only
  1234  GLOBL masks<>(SB),RODATA,$256
  1235  
  1236  // shifts<> is a table of sixteen 16-byte PSHUFB control vectors.
  1237  // Entry n (at byte offset 16*n) moves the top n bytes of a register
  1238  // down to its low n bytes; control bytes of 0xff zero the remaining bytes.
        // entry 0: all zero. The user visible here, aeshashbody, branches away
        // for length 0 before it indexes this table.
  1239  DATA shifts<>+0x00(SB)/4, $0x00000000
  1240  DATA shifts<>+0x04(SB)/4, $0x00000000
  1241  DATA shifts<>+0x08(SB)/4, $0x00000000
  1242  DATA shifts<>+0x0c(SB)/4, $0x00000000
  1243  	// entry 1: byte 15 -> byte 0
  1244  DATA shifts<>+0x10(SB)/4, $0xffffff0f
  1245  DATA shifts<>+0x14(SB)/4, $0xffffffff
  1246  DATA shifts<>+0x18(SB)/4, $0xffffffff
  1247  DATA shifts<>+0x1c(SB)/4, $0xffffffff
  1248  	// entry 2: bytes 14-15 -> bytes 0-1
  1249  DATA shifts<>+0x20(SB)/4, $0xffff0f0e
  1250  DATA shifts<>+0x24(SB)/4, $0xffffffff
  1251  DATA shifts<>+0x28(SB)/4, $0xffffffff
  1252  DATA shifts<>+0x2c(SB)/4, $0xffffffff
  1253  	// entry 3: bytes 13-15 -> bytes 0-2
  1254  DATA shifts<>+0x30(SB)/4, $0xff0f0e0d
  1255  DATA shifts<>+0x34(SB)/4, $0xffffffff
  1256  DATA shifts<>+0x38(SB)/4, $0xffffffff
  1257  DATA shifts<>+0x3c(SB)/4, $0xffffffff
  1258  	// entry 4: bytes 12-15 -> bytes 0-3
  1259  DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c
  1260  DATA shifts<>+0x44(SB)/4, $0xffffffff
  1261  DATA shifts<>+0x48(SB)/4, $0xffffffff
  1262  DATA shifts<>+0x4c(SB)/4, $0xffffffff
  1263  	// entry 5: bytes 11-15 -> bytes 0-4
  1264  DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b
  1265  DATA shifts<>+0x54(SB)/4, $0xffffff0f
  1266  DATA shifts<>+0x58(SB)/4, $0xffffffff
  1267  DATA shifts<>+0x5c(SB)/4, $0xffffffff
  1268  	// entry 6: bytes 10-15 -> bytes 0-5
  1269  DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a
  1270  DATA shifts<>+0x64(SB)/4, $0xffff0f0e
  1271  DATA shifts<>+0x68(SB)/4, $0xffffffff
  1272  DATA shifts<>+0x6c(SB)/4, $0xffffffff
  1273  	// entry 7: bytes 9-15 -> bytes 0-6
  1274  DATA shifts<>+0x70(SB)/4, $0x0c0b0a09
  1275  DATA shifts<>+0x74(SB)/4, $0xff0f0e0d
  1276  DATA shifts<>+0x78(SB)/4, $0xffffffff
  1277  DATA shifts<>+0x7c(SB)/4, $0xffffffff
  1278  	// entry 8: bytes 8-15 -> bytes 0-7
  1279  DATA shifts<>+0x80(SB)/4, $0x0b0a0908
  1280  DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c
  1281  DATA shifts<>+0x88(SB)/4, $0xffffffff
  1282  DATA shifts<>+0x8c(SB)/4, $0xffffffff
  1283  	// entry 9: bytes 7-15 -> bytes 0-8
  1284  DATA shifts<>+0x90(SB)/4, $0x0a090807
  1285  DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b
  1286  DATA shifts<>+0x98(SB)/4, $0xffffff0f
  1287  DATA shifts<>+0x9c(SB)/4, $0xffffffff
  1288  	// entry 10: bytes 6-15 -> bytes 0-9
  1289  DATA shifts<>+0xa0(SB)/4, $0x09080706
  1290  DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a
  1291  DATA shifts<>+0xa8(SB)/4, $0xffff0f0e
  1292  DATA shifts<>+0xac(SB)/4, $0xffffffff
  1293  	// entry 11: bytes 5-15 -> bytes 0-10
  1294  DATA shifts<>+0xb0(SB)/4, $0x08070605
  1295  DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09
  1296  DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d
  1297  DATA shifts<>+0xbc(SB)/4, $0xffffffff
  1298  	// entry 12: bytes 4-15 -> bytes 0-11
  1299  DATA shifts<>+0xc0(SB)/4, $0x07060504
  1300  DATA shifts<>+0xc4(SB)/4, $0x0b0a0908
  1301  DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c
  1302  DATA shifts<>+0xcc(SB)/4, $0xffffffff
  1303  	// entry 13: bytes 3-15 -> bytes 0-12
  1304  DATA shifts<>+0xd0(SB)/4, $0x06050403
  1305  DATA shifts<>+0xd4(SB)/4, $0x0a090807
  1306  DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b
  1307  DATA shifts<>+0xdc(SB)/4, $0xffffff0f
  1308  	// entry 14: bytes 2-15 -> bytes 0-13
  1309  DATA shifts<>+0xe0(SB)/4, $0x05040302
  1310  DATA shifts<>+0xe4(SB)/4, $0x09080706
  1311  DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a
  1312  DATA shifts<>+0xec(SB)/4, $0xffff0f0e
  1313  	// entry 15: bytes 1-15 -> bytes 0-14
  1314  DATA shifts<>+0xf0(SB)/4, $0x04030201
  1315  DATA shifts<>+0xf4(SB)/4, $0x08070605
  1316  DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09
  1317  DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d
  1318
        // 16 entries * 16 bytes, read-only
  1319  GLOBL shifts<>(SB),RODATA,$256
  1320  
  1321  TEXT runtime·memeq(SB),NOSPLIT,$0-13
        	// Load memeqbody's register arguments and tail-call it.
  1322  	LEAL	ret+12(FP), AX	// address of the result byte
  1323  	MOVL	size+8(FP), BX	// byte count
  1324  	MOVL	b+4(FP), DI
  1325  	MOVL	a+0(FP), SI
  1326  	JMP	runtime·memeqbody(SB)
  1327  
  1328  // memequal_varlen(a, b unsafe.Pointer) bool
        // The byte count comes from the closure in DX rather than from an argument.
  1329  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
  1330  	MOVL	a+0(FP), SI
  1331  	MOVL	b+4(FP), DI
  1332  	CMPL	SI, DI
  1333  	JNE	compare
  1334  	MOVB	$1, ret+8(FP)	// same address, so trivially equal
  1335  	RET
  1336  compare:
  1337  	LEAL	ret+8(FP), AX	// address of the result byte for memeqbody
  1338  	MOVL	4(DX), BX	// compiler stores size at offset 4 in the closure
  1339  	JMP	runtime·memeqbody(SB)
  1340  
  1341  // eqstring reports whether two strings hold the same bytes.
  1342  // Only s1's length is read: the compiler guarantees that
  1343  // strings passed to eqstring have equal length.
  1344  // See runtime_test.go:eqstring_generic for
  1345  // equivalent Go code.
  1346  TEXT runtime·eqstring(SB),NOSPLIT,$0-17
  1347  	MOVL	s1str+0(FP), SI
  1348  	MOVL	s2str+8(FP), DI
  1349  	CMPL	SI, DI
  1350  	JNE	compare
  1351  	MOVB	$1, v+16(FP)	// identical data pointers
  1352  	RET
  1353  compare:
  1354  	LEAL	v+16(FP), AX	// address of the result byte for memeqbody
  1355  	MOVL	s1len+4(FP), BX
  1356  	JMP	runtime·memeqbody(SB)
  1357  
  1358  TEXT bytes·Equal(SB),NOSPLIT,$0-25
        	// Slices of different length are unequal; otherwise compare the bytes.
  1359  	MOVL	b_len+16(FP), CX
  1360  	MOVL	a_len+4(FP), BX
  1361  	CMPL	BX, CX
  1362  	JEQ	samelen
  1363  	MOVB	$0, ret+24(FP)
  1364  	RET
  1365  samelen:
  1366  	LEAL	ret+24(FP), AX	// address of the result byte for memeqbody
  1367  	MOVL	b+12(FP), DI
  1368  	MOVL	a+0(FP), SI
  1369  	JMP	runtime·memeqbody(SB)
  1370  
  1371  // memeqbody reports whether two byte ranges of the same length are identical.
  1372  // Register inputs (there are no stack arguments):
  1373  //   a in SI, b in DI, count in BX
  1374  //   address of result byte in AX (written as 1 if equal, 0 if not)
  1375  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1376  	CMPL	BX, $4
  1377  	JB	small
  1378
        	// The CPU feature word cannot change while we run, so test it once
        	// here rather than on every 64-byte iteration.
  1379  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
  1380  	JE	bigloop
        
  1381  	// 64 bytes at a time using xmm registers
  1382  hugeloop:
  1383  	CMPL	BX, $64
  1384  	JB	bigloop
  1385  	MOVOU	(SI), X0
  1386  	MOVOU	(DI), X1
  1387  	MOVOU	16(SI), X2
  1388  	MOVOU	16(DI), X3
  1389  	MOVOU	32(SI), X4
  1390  	MOVOU	32(DI), X5
  1391  	MOVOU	48(SI), X6
  1392  	MOVOU	48(DI), X7
  1393  	PCMPEQB	X1, X0	// each byte becomes 0xff where a and b agree
  1394  	PCMPEQB	X3, X2
  1395  	PCMPEQB	X5, X4
  1396  	PCMPEQB	X7, X6
  1397  	PAND	X2, X0	// combine the four 16-byte results
  1398  	PAND	X6, X4
  1399  	PAND	X4, X0
  1400  	PMOVMSKB X0, DX	// one bit per byte; 0xffff means all 64 bytes matched
  1401  	ADDL	$64, SI
  1402  	ADDL	$64, DI
  1403  	SUBL	$64, BX
  1404  	CMPL	DX, $0xffff
  1405  	JEQ	hugeloop
  1406  	MOVB	$0, (AX)
  1407  	RET
  1408
  1409  	// 4 bytes at a time using 32-bit register
  1410  bigloop:
  1411  	CMPL	BX, $4
  1412  	JBE	leftover
  1413  	MOVL	(SI), CX
  1414  	MOVL	(DI), DX
  1415  	ADDL	$4, SI
  1416  	ADDL	$4, DI
  1417  	SUBL	$4, BX
  1418  	CMPL	CX, DX
  1419  	JEQ	bigloop
  1420  	MOVB	$0, (AX)
  1421  	RET
  1422
  1423  	// remaining 0-4 bytes: compare the 4 bytes that end at the last byte.
        	// This path is only reached when the original count was at least 4,
        	// so the load may overlap bytes already compared but stays in range.
  1424  leftover:
  1425  	MOVL	-4(SI)(BX*1), CX
  1426  	MOVL	-4(DI)(BX*1), DX
  1427  	CMPL	CX, DX
  1428  	SETEQ	(AX)
  1429  	RET
  1430
        	// 0-3 bytes in total
  1431  small:
  1432  	CMPL	BX, $0
  1433  	JEQ	equal	// ZF is set, so the SETEQ at equal stores 1
  1434
  1435  	LEAL	0(BX*8), CX
  1436  	NEGL	CX	// as a shift count (taken mod 32) this is 32-8*count
  1437
  1438  	MOVL	SI, DX
  1439  	CMPB	DX, $0xfc
  1440  	JA	si_high
  1441
  1442  	// load at SI won't cross a page boundary.
  1443  	MOVL	(SI), SI
  1444  	JMP	si_finish
  1445  si_high:
  1446  	// low address byte is 0xfd-0xff, so a 4-byte load at SI could cross a page.
        	// Load up to bytes we want, move to correct position.
  1447  	MOVL	-4(SI)(BX*1), SI
  1448  	SHRL	CX, SI
  1449  si_finish:
  1450
  1451  	// same for DI.
  1452  	MOVL	DI, DX
  1453  	CMPB	DX, $0xfc
  1454  	JA	di_high
  1455  	MOVL	(DI), DI
  1456  	JMP	di_finish
  1457  di_high:
  1458  	MOVL	-4(DI)(BX*1), DI
  1459  	SHRL	CX, DI
  1460  di_finish:
  1461
  1462  	SUBL	SI, DI
  1463  	SHLL	CX, DI	// discard the bytes above count; ZF = the wanted bytes are equal
  1464  equal:
  1465  	SETEQ	(AX)
  1466  	RET
  1467  
  1468  TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
        	// Load cmpbody's register arguments and tail-call it.
  1469  	LEAL	ret+16(FP), AX	// address of the result word
  1470  	MOVL	s2_len+12(FP), DX
  1471  	MOVL	s2_base+8(FP), DI
  1472  	MOVL	s1_len+4(FP), BX
  1473  	MOVL	s1_base+0(FP), SI
  1474  	JMP	runtime·cmpbody(SB)
  1475  
  1476  TEXT bytes·Compare(SB),NOSPLIT,$0-28
        	// Load cmpbody's register arguments and tail-call it.
  1477  	LEAL	ret+24(FP), AX	// address of the result word
  1478  	MOVL	s2+16(FP), DX	// second length
  1479  	MOVL	s2+12(FP), DI	// second data pointer
  1480  	MOVL	s1+4(FP), BX	// first length
  1481  	MOVL	s1+0(FP), SI	// first data pointer
  1482  	JMP	runtime·cmpbody(SB)
  1483  
  1484  TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
        	// Return the index of the first byte in s equal to c, or -1.
        	// With s_len == 0 the scan never runs and ZF keeps its entry value;
        	// both exits still produce -1 because DI has not moved from SI.
  1485  	MOVB	c+12(FP), AL	// byte to look for
  1486  	MOVL	s_len+4(FP), CX	// maximum number of bytes to scan
  1487  	MOVL	s+0(FP), SI	// start of the data, kept to compute the index
  1488  	MOVL	SI, DI	// SCASB advances DI
  1489  	CLD
        	REPN; SCASB
  1490  	JZ	found
  1491  	MOVL	$-1, ret+16(FP)
  1492  	RET
        found:
  1493  	SUBL	SI, DI	// DI stopped one past the match
  1494  	SUBL	$1, DI
  1495  	MOVL	DI, ret+16(FP)
  1496  	RET
  1497  
  1498  TEXT strings·IndexByte(SB),NOSPLIT,$0-16
        	// Return the index of the first byte in s equal to c, or -1.
        	// With s_len == 0 the scan never runs and ZF keeps its entry value;
        	// both exits still produce -1 because DI has not moved from SI.
  1499  	MOVB	c+8(FP), AL	// byte to look for
  1500  	MOVL	s_len+4(FP), CX	// maximum number of bytes to scan
  1501  	MOVL	s+0(FP), SI	// start of the data, kept to compute the index
  1502  	MOVL	SI, DI	// SCASB advances DI
  1503  	CLD
        	REPN; SCASB
  1504  	JZ	found
  1505  	MOVL	$-1, ret+12(FP)
  1506  	RET
        found:
  1507  	SUBL	SI, DI	// DI stopped one past the match
  1508  	SUBL	$1, DI
  1509  	MOVL	DI, ret+12(FP)
  1510  	RET
  1511  
  1512  // cmpbody compares two byte ranges lexicographically.
        // input:
  1513  //   SI = a
  1514  //   DI = b
  1515  //   BX = alen
  1516  //   DX = blen
  1517  //   AX = address of return word (set to 1/0/-1)
  1518  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1519  	MOVL	DX, BP
  1520  	SUBL	BX, DX // DX = blen-alen
        	// Plain branch instead of CMOV: the SSE2 paths below are guarded by
        	// CPUID, but nothing guards CMOV, which not every 386-class CPU has.
  1521  	JLE	minset // blen <= alen: BP already holds the minimum
        	MOVL	BX, BP // BP = min(alen, blen)
        minset:
  1522  	CMPL	SI, DI
  1523  	JEQ	allsame
  1524  	CMPL	BP, $4
  1525  	JB	small
  1526  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
  1527  	JE	mediumloop
        	// 16 bytes at a time using xmm registers
  1528  largeloop:
  1529  	CMPL	BP, $16
  1530  	JB	mediumloop
  1531  	MOVOU	(SI), X0
  1532  	MOVOU	(DI), X1
  1533  	PCMPEQB X0, X1
  1534  	PMOVMSKB X1, BX
  1535  	XORL	$0xffff, BX	// convert EQ to NE
  1536  	JNE	diff16	// branch if at least one byte is not equal
  1537  	ADDL	$16, SI
  1538  	ADDL	$16, DI
  1539  	SUBL	$16, BP
  1540  	JMP	largeloop
  1541
  1542  diff16:
  1543  	BSFL	BX, BX	// index of first byte that differs
  1544  	XORL	DX, DX
  1545  	MOVB	(SI)(BX*1), CX
  1546  	CMPB	CX, (DI)(BX*1)
  1547  	SETHI	DX	// 1 if a's byte is the larger one (unsigned)
  1548  	LEAL	-1(DX*2), DX	// convert 1/0 to +1/-1
  1549  	MOVL	DX, (AX)
  1550  	RET
  1551
        	// 4 bytes at a time using 32-bit registers
  1552  mediumloop:
  1553  	CMPL	BP, $4
  1554  	JBE	_0through4
  1555  	MOVL	(SI), BX
  1556  	MOVL	(DI), CX
  1557  	CMPL	BX, CX
  1558  	JNE	diff4
  1559  	ADDL	$4, SI
  1560  	ADDL	$4, DI
  1561  	SUBL	$4, BP
  1562  	JMP	mediumloop
  1563
        	// 0-4 bytes left: compare the 4 bytes that end at the last common byte.
        	// Only reached when at least 4 common bytes existed on entry.
  1564  _0through4:
  1565  	MOVL	-4(SI)(BP*1), BX
  1566  	MOVL	-4(DI)(BP*1), CX
  1567  	CMPL	BX, CX
  1568  	JEQ	allsame
  1569
        	// BX and CX hold 4 bytes of a and b that differ somewhere
  1570  diff4:
  1571  	BSWAPL	BX	// reverse order of bytes
  1572  	BSWAPL	CX
  1573  	XORL	BX, CX	// find bit differences
  1574  	BSRL	CX, CX	// index of highest bit difference
  1575  	SHRL	CX, BX	// move a's bit to bottom
  1576  	ANDL	$1, BX	// mask bit
  1577  	LEAL	-1(BX*2), BX // 1/0 => +1/-1
  1578  	MOVL	BX, (AX)
  1579  	RET
  1580
  1581  	// 0-3 bytes in common
  1582  small:
  1583  	LEAL	(BP*8), CX
  1584  	NEGL	CX	// as a shift count (taken mod 32) this is 32-8*BP
  1585  	JEQ	allsame	// no bytes in common: only the lengths matter
  1586
  1587  	// load si; if the low address byte is above 0xfc a 4-byte load could
        	// cross a page, so load the 4 bytes ending at the last wanted byte
  1588  	CMPB	SI, $0xfc
  1589  	JA	si_high
  1590  	MOVL	(SI), SI
  1591  	JMP	si_finish
  1592  si_high:
  1593  	MOVL	-4(SI)(BP*1), SI
  1594  	SHRL	CX, SI
  1595  si_finish:
  1596  	SHLL	CX, SI	// discard the bytes beyond the common length
  1597
  1598  	// same for di
  1599  	CMPB	DI, $0xfc
  1600  	JA	di_high
  1601  	MOVL	(DI), DI
  1602  	JMP	di_finish
  1603  di_high:
  1604  	MOVL	-4(DI)(BP*1), DI
  1605  	SHRL	CX, DI
  1606  di_finish:
  1607  	SHLL	CX, DI
  1608
  1609  	BSWAPL	SI	// reverse order of bytes
  1610  	BSWAPL	DI
  1611  	XORL	SI, DI	// find bit differences
  1612  	JEQ	allsame
  1613  	BSRL	DI, CX	// index of highest bit difference
  1614  	SHRL	CX, SI	// move a's bit to bottom
  1615  	ANDL	$1, SI	// mask bit
  1616  	LEAL	-1(SI*2), BX // 1/0 => +1/-1
  1617  	MOVL	BX, (AX)
  1618  	RET
  1619
  1620  	// all the bytes in common are the same, so we just need
  1621  	// to compare the lengths. DX still holds blen-alen here.
  1622  allsame:
  1623  	XORL	BX, BX
  1624  	XORL	CX, CX
  1625  	TESTL	DX, DX
  1626  	SETLT	BX	// 1 if alen > blen
  1627  	SETEQ	CX	// 1 if alen == blen
  1628  	LEAL	-1(CX)(BX*2), BX	// 1,0,-1 result
  1629  	MOVL	BX, (AX)
  1630  	RET
  1631  
  1632  TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
        	// Advance the generator state kept in m.fastrand and return it:
        	//	x += x
        	//	if the doubled x has its top bit set { x ^= 0x88888eef }
  1633  	get_tls(CX)
  1634  	MOVL	g(CX), AX
  1635  	MOVL	g_m(AX), AX
  1636  	MOVL	m_fastrand(AX), DX
  1637  	ADDL	DX, DX
  1638  	MOVL	DX, BX	// BX = doubled value, before the xor
  1639  	XORL	$0x88888eef, DX
        	// The constant's top bit is set, so the xor result is non-negative
        	// exactly when the doubled value was negative, i.e. when the xor
        	// must be kept. A branch is used rather than CMOV because nothing
        	// here checks that the CPU implements CMOV.
  1640  	JPL	store
        	MOVL	BX, DX	// doubled value was non-negative: drop the xor
        store:
  1641  	MOVL	DX, m_fastrand(AX)
  1642  	MOVL	DX, ret+0(FP)
  1643  	RET
  1644  
  1645  TEXT runtime·return0(SB), NOSPLIT, $0
        	// Leaves 0 in AX and returns; nothing is written to the stack.
  1646  	MOVL	$0, AX
  1647  	RET
  1648  
  1649  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  1650  // Must obey the gcc calling convention.
        // The result is left in AX; AX and CX are the only registers written.
  1651  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  1652  	get_tls(CX)
  1653  	MOVL	g(CX), AX	// AX = g
  1654  	MOVL	g_m(AX), AX	// AX = g->m
  1655  	MOVL	m_curg(AX), AX	// AX = g->m->curg
  1656  	MOVL	(g_stack+stack_hi)(AX), AX	// AX = curg.stack.hi
  1657  	RET
  1658  
  1659  // The top-most function running on a goroutine
  1660  // returns to goexit+PCQuantum.
        // The leading NOP keeps that return address inside this function, just
        // before the CALL. There is deliberately no RET.
  1661  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  1662  	BYTE	$0x90	// NOP
  1663  	CALL	runtime·goexit1(SB)	// does not return
  1664  	// traceback from goexit1 must hit code range of goexit
  1665  	BYTE	$0x90	// NOP
  1666  
  1667  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4
        	// Issue a PREFETCHT0 hint for the address passed in addr.
  1668  	MOVL	addr+0(FP), AX
  1669  	PREFETCHT0	(AX)
  1670  	RET
  1671  
  1672  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4
        	// Issue a PREFETCHT1 hint for the address passed in addr.
  1673  	MOVL	addr+0(FP), AX
  1674  	PREFETCHT1	(AX)
  1675  	RET
  1676  
  1677  
  1678  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4
        	// Issue a PREFETCHT2 hint for the address passed in addr.
  1679  	MOVL	addr+0(FP), AX
  1680  	PREFETCHT2	(AX)
  1681  	RET
  1682  
  1683  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4
        	// Issue a PREFETCHNTA (non-temporal) hint for the address passed in addr.
  1684  	MOVL	addr+0(FP), AX
  1685  	PREFETCHNTA	(AX)
  1686  	RET