github.com/peggyl/go@v0.0.0-20151008231540-ae315999c2d5/src/runtime/asm_386.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
        	// Bootstrap sequence: stash argc/argv, give g0 provisional stack
        	// bounds, record CPUID results, set up TLS (through _cgo_init when
        	// it is non-zero, otherwise through ldt0setup), link g0 and m0,
        	// run check/args/osinit/schedinit, queue runtime·main with newproc
        	// and finally call mstart.
    11  	// copy arguments forward on an even stack
    12  	MOVL	argc+0(FP), AX
    13  	MOVL	argv+4(FP), BX
    14  	SUBL	$128, SP		// plenty of scratch
    15  	ANDL	$~15, SP	// clear the low four bits: SP is now 16-byte aligned
    16  	MOVL	AX, 120(SP)		// save argc, argv away
    17  	MOVL	BX, 124(SP)
    18  
    19  	// set default stack bounds.
    20  	// _cgo_init may update stackguard.
    21  	MOVL	$runtime·g0(SB), BP
    22  	LEAL	(-64*1024+104)(SP), BX	// BX = SP - 64KB + 104, used as guard and stack.lo
    23  	MOVL	BX, g_stackguard0(BP)
    24  	MOVL	BX, g_stackguard1(BP)
    25  	MOVL	BX, (g_stack+stack_lo)(BP)
    26  	MOVL	SP, (g_stack+stack_hi)(BP)
    27  	
    28  	// find out information about the processor we're on
    29  	MOVL	$0, AX
    30  	CPUID
    31  	CMPL	AX, $0	// AX == 0 after CPUID leaf 0: skip the leaf 1 query below
    32  	JE	nocpuinfo
    33  
    34  	// Figure out how to serialize RDTSC.
    35  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    36  	// Don't know about the rest, so let's do MFENCE.
    37  	CMPL	BX, $0x756E6547  // "Genu"
    38  	JNE	notintel
    39  	CMPL	DX, $0x49656E69  // "ineI"
    40  	JNE	notintel
    41  	CMPL	CX, $0x6C65746E  // "ntel"
    42  	JNE	notintel
    43  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)	// all three vendor words matched
    44  notintel:
    45  
    46  	MOVL	$1, AX
    47  	CPUID
    48  	MOVL	CX, runtime·cpuid_ecx(SB)
    49  	MOVL	DX, runtime·cpuid_edx(SB)
    50  nocpuinfo:	
    51  
    52  	// if there is an _cgo_init, call it to let it
    53  	// initialize and to set up GS.  if not,
    54  	// we set up GS ourselves.
    55  	MOVL	_cgo_init(SB), AX
    56  	TESTL	AX, AX
    57  	JZ	needtls
    58  	MOVL	$setg_gcc<>(SB), BX
    59  	MOVL	BX, 4(SP)	// second stack argument: address of setg_gcc
    60  	MOVL	BP, 0(SP)	// first stack argument: BP was loaded with $runtime·g0 above
    61  	CALL	AX
    62  
    63  	// update stackguard after _cgo_init
    64  	MOVL	$runtime·g0(SB), CX
    65  	MOVL	(g_stack+stack_lo)(CX), AX
    66  	ADDL	$const__StackGuard, AX
    67  	MOVL	AX, g_stackguard0(CX)
    68  	MOVL	AX, g_stackguard1(CX)
    69  
    70  	// skip runtime·ldt0setup(SB) and tls test after _cgo_init for non-windows
    71  	CMPL runtime·iswindows(SB), $0
    72  	JEQ ok
    73  needtls:
    74  	// skip runtime·ldt0setup(SB) and tls test on Plan 9 in all cases
    75  	CMPL	runtime·isplan9(SB), $1
    76  	JEQ	ok
    77  
    78  	// set up %gs
    79  	CALL	runtime·ldt0setup(SB)
    80  
    81  	// store through it, to make sure it works
    82  	get_tls(BX)
    83  	MOVL	$0x123, g(BX)
    84  	MOVL	runtime·tls0(SB), AX	// read the value back through the tls0 symbol
    85  	CMPL	AX, $0x123
    86  	JEQ	ok
    87  	MOVL	AX, 0	// abort
    88  ok:
    89  	// set up m and g "registers"
    90  	get_tls(BX)
    91  	LEAL	runtime·g0(SB), CX
    92  	MOVL	CX, g(BX)
    93  	LEAL	runtime·m0(SB), AX
    94  
    95  	// save m->g0 = g0
    96  	MOVL	CX, m_g0(AX)
    97  	// save g0->m = m0
    98  	MOVL	AX, g_m(CX)
    99  
   100  	CALL	runtime·emptyfunc(SB)	// fault if stack check is wrong
   101  
   102  	// convention is D is always cleared
   103  	CLD
   104  
   105  	CALL	runtime·check(SB)
   106  
   107  	// saved argc, argv
   108  	MOVL	120(SP), AX
   109  	MOVL	AX, 0(SP)
   110  	MOVL	124(SP), AX
   111  	MOVL	AX, 4(SP)
   112  	CALL	runtime·args(SB)
   113  	CALL	runtime·osinit(SB)
   114  	CALL	runtime·schedinit(SB)
   115  
   116  	// create a new goroutine to start program
   117  	PUSHL	$runtime·mainPC(SB)	// entry
   118  	PUSHL	$0	// arg size
   119  	CALL	runtime·newproc(SB)
   120  	POPL	AX	// discard the two words pushed for newproc
   121  	POPL	AX
   122  
   123  	// start this M
   124  	CALL	runtime·mstart(SB)
   125  
   126  	INT $3	// executed only if the CALL to mstart above returns
   127  	RET
   128  
        // runtime·mainPC is a 4-byte read-only word holding the address of
        // runtime·main; rt0_go pushes the address of this word as the entry
        // argument of newproc.
   129  DATA	runtime·mainPC+0(SB)/4,$runtime·main(SB)
   130  GLOBL	runtime·mainPC(SB),RODATA,$4
   131  
   132  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
        	// Executes a breakpoint trap (INT $3); returns to the caller if
        	// execution is resumed after the trap.
   133  	INT $3
   134  	RET
   135  
   136  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   137  	// Linux and MinGW start the FPU in extended double precision.
   138  	// Other operating systems use double precision.
   139  	// Change to double precision to match them,
   140  	// and to match other hardware that only has double.
   141  	PUSHL $0x27F	// stage the new control word on the stack
   142  	FLDCW	0(SP)	// load it into the x87 control register
   143  	POPL AX	// drop the staged word again (AX is overwritten)
   144  	RET
   145  
   146  /*
   147   *  go-routine
   148   */
   149  
   150  // void gosave(Gobuf*)
   151  // save state in Gobuf; setjmp
        // Stores the caller's SP, the return PC and the current g into *buf and
        // zeroes buf's ret and ctxt fields. Clobbers AX, BX and CX.
   152  TEXT runtime·gosave(SB), NOSPLIT, $0-4
   153  	MOVL	buf+0(FP), AX		// gobuf
   154  	LEAL	buf+0(FP), BX		// caller's SP
   155  	MOVL	BX, gobuf_sp(AX)
   156  	MOVL	0(SP), BX		// caller's PC
   157  	MOVL	BX, gobuf_pc(AX)
   158  	MOVL	$0, gobuf_ret(AX)
   159  	MOVL	$0, gobuf_ctxt(AX)
   160  	get_tls(CX)
   161  	MOVL	g(CX), BX		// current g
   162  	MOVL	BX, gobuf_g(AX)
   163  	RET
   164  
   165  // void gogo(Gobuf*)
   166  // restore state from Gobuf; longjmp
        // Installs buf->g as the current g, loads SP, AX (ret) and DX (ctxt)
        // from buf, zeroes the sp/ret/ctxt fields, and jumps to buf->pc.
        // Control does not come back to the caller of gogo.
   167  TEXT runtime·gogo(SB), NOSPLIT, $0-4
   168  	MOVL	buf+0(FP), BX		// gobuf
   169  	MOVL	gobuf_g(BX), DX
   170  	MOVL	0(DX), CX		// make sure g != nil
        	// (the loaded value is discarded: get_tls overwrites CX next)
   171  	get_tls(CX)
   172  	MOVL	DX, g(CX)
   173  	MOVL	gobuf_sp(BX), SP	// restore SP
   174  	MOVL	gobuf_ret(BX), AX
   175  	MOVL	gobuf_ctxt(BX), DX
   176  	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
   177  	MOVL	$0, gobuf_ret(BX)
   178  	MOVL	$0, gobuf_ctxt(BX)
   179  	MOVL	gobuf_pc(BX), BX	// BX = saved PC; the gobuf pointer is no longer needed
   180  	JMP	BX
   181  
   182  // func mcall(fn func(*g))
   183  // Runs fn(g) on the m->g0 stack after recording the caller's PC, SP
   184  // and g in g->sched. fn is entered with DX pointing at the closure and
   185  // the old g as its only stack argument. fn must never return; it
   186  // should gogo(&g->sched) to keep running g.
   187  TEXT runtime·mcall(SB), NOSPLIT, $0-4
   188  	MOVL	fn+0(FP), DI		// DI = fn closure
   189  	get_tls(CX)
   190  	MOVL	g(CX), AX		// AX = current g
   191  
   192  	// record the caller's context in g->sched
   193  	MOVL	AX, (g_sched+gobuf_g)(AX)
   194  	LEAL	fn+0(FP), BX		// SP as seen by the caller
   195  	MOVL	BX, (g_sched+gobuf_sp)(AX)
   196  	MOVL	0(SP), BX		// return PC into the caller
   197  	MOVL	BX, (g_sched+gobuf_pc)(AX)
   198  
   199  	// find m->g0; being on g0 already is an error (badmcall)
   200  	MOVL	g_m(AX), BX		// BX = g->m
   201  	MOVL	m_g0(BX), SI		// SI = m->g0
   202  	CMPL	SI, AX
   203  	JNE	mcall_switch
   204  	MOVL	$runtime·badmcall(SB), AX
   205  	JMP	AX
   206  
   207  mcall_switch:
   208  	MOVL	SI, g(CX)		// g = m->g0
   209  	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   210  	PUSHL	AX			// argument for fn: the old g
   211  	MOVL	DI, DX			// DX = closure
   212  	MOVL	0(DI), DI		// DI = code pointer stored in the closure
   213  	CALL	DI
   214  	POPL	AX
        	// fn came back, which it must not do
        	MOVL	$runtime·badmcall2(SB), AX
        	JMP	AX
        	RET
   215  
   216  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   217  // of the G stack.  We need to distinguish the routine that
   218  // lives at the bottom of the G stack from the one that lives
   219  // at the top of the system stack because the one at the top of
   220  // the system stack terminates the stack walk (see topofstack()).
        // systemstack stores this routine's address in g->sched.pc before it
        // switches stacks.
   221  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   222  	RET
   223  
   224  // func systemstack(fn func())
        // Calls fn on the m->g0 stack.
        //  - current g is m->gsignal or m->g0: fn is called directly (noswitch).
        //  - current g is m->curg: state is saved in g->sched, SP moves to
        //    g0's saved sp, fn is called, then g and SP are switched back.
        //  - anything else: badsystemstack is called.
        // fn is entered with DX pointing at the closure.
   225  TEXT runtime·systemstack(SB), NOSPLIT, $0-4
   226  	MOVL	fn+0(FP), DI	// DI = fn
   227  	get_tls(CX)
   228  	MOVL	g(CX), AX	// AX = g
   229  	MOVL	g_m(AX), BX	// BX = m
   230  
   231  	MOVL	m_gsignal(BX), DX	// DX = gsignal
   232  	CMPL	AX, DX
   233  	JEQ	noswitch
   234  
   235  	MOVL	m_g0(BX), DX	// DX = g0
   236  	CMPL	AX, DX
   237  	JEQ	noswitch
   238  
   239  	MOVL	m_curg(BX), BP
   240  	CMPL	AX, BP
   241  	JEQ	switch
   242  	
   243  	// Bad: g is not gsignal, not g0, not curg. What is it?
   244  	// Hide call from linker nosplit analysis.
   245  	MOVL	$runtime·badsystemstack(SB), AX
   246  	CALL	AX
   247  
   248  switch:
   249  	// save our state in g->sched.  Pretend to
   250  	// be systemstack_switch if the G stack is scanned.
   251  	MOVL	$runtime·systemstack_switch(SB), (g_sched+gobuf_pc)(AX)
   252  	MOVL	SP, (g_sched+gobuf_sp)(AX)
   253  	MOVL	AX, (g_sched+gobuf_g)(AX)
   254  
   255  	// switch to g0
   256  	MOVL	DX, g(CX)
   257  	MOVL	(g_sched+gobuf_sp)(DX), BX
   258  	// make it look like mstart called systemstack on g0, to stop traceback
   259  	SUBL	$4, BX
   260  	MOVL	$runtime·mstart(SB), DX
   261  	MOVL	DX, 0(BX)
   262  	MOVL	BX, SP
   263  
   264  	// call target function
   265  	MOVL	DI, DX	// DX = closure
   266  	MOVL	0(DI), DI	// DI = code pointer stored in the closure
   267  	CALL	DI
   268  
   269  	// switch back to g
   270  	get_tls(CX)
   271  	MOVL	g(CX), AX
   272  	MOVL	g_m(AX), BX
   273  	MOVL	m_curg(BX), AX
   274  	MOVL	AX, g(CX)
   275  	MOVL	(g_sched+gobuf_sp)(AX), SP	// back on the SP saved at "switch"
   276  	MOVL	$0, (g_sched+gobuf_sp)(AX)
   277  	RET
   278  
   279  noswitch:
   280  	// already on system stack, just call directly
   281  	MOVL	DI, DX
   282  	MOVL	0(DI), DI
   283  	CALL	DI
   284  	RET
   285  
   286  /*
   287   * support for morestack
   288   */
   289  
   290  // Called during function prolog when more stack is needed.
   291  //
   292  // The traceback routines see morestack on a g0 as being
   293  // the top of a stack (for example, morestack calling newstack
   294  // calling the scheduler calling newm calling gc), so we must
   295  // record an argument size. For that purpose, it has no arguments.
        //
        // Steps: trap if the current g is m->g0 or m->gsignal; record f's
        // caller in m->morebuf and f's own context (including DX as ctxt) in
        // g->sched; switch to g0 and call newstack.
   296  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   297  	// Cannot grow scheduler stack (m->g0).
   298  	get_tls(CX)
   299  	MOVL	g(CX), BX
   300  	MOVL	g_m(BX), BX	// BX = m for the rest of the function
   301  	MOVL	m_g0(BX), SI
   302  	CMPL	g(CX), SI
   303  	JNE	2(PC)	// skip the trap when g != m->g0
   304  	INT	$3
   305  
   306  	// Cannot grow signal stack.
   307  	MOVL	m_gsignal(BX), SI
   308  	CMPL	g(CX), SI
   309  	JNE	2(PC)	// skip the trap when g != m->gsignal
   310  	INT	$3
   311  
   312  	// Called from f.
   313  	// Set m->morebuf to f's caller.
   314  	MOVL	4(SP), DI	// f's caller's PC
   315  	MOVL	DI, (m_morebuf+gobuf_pc)(BX)
   316  	LEAL	8(SP), CX	// f's caller's SP
   317  	MOVL	CX, (m_morebuf+gobuf_sp)(BX)
   318  	get_tls(CX)	// CX was used as scratch above; reload the TLS base
   319  	MOVL	g(CX), SI
   320  	MOVL	SI, (m_morebuf+gobuf_g)(BX)
   321  
   322  	// Set g->sched to context in f.
   323  	MOVL	0(SP), AX	// f's PC
   324  	MOVL	AX, (g_sched+gobuf_pc)(SI)
   325  	MOVL	SI, (g_sched+gobuf_g)(SI)
   326  	LEAL	4(SP), AX	// f's SP
   327  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   328  	MOVL	DX, (g_sched+gobuf_ctxt)(SI)
   329  
   330  	// Call newstack on m->g0's stack.
   331  	MOVL	m_g0(BX), BP
   332  	MOVL	BP, g(CX)
   333  	MOVL	(g_sched+gobuf_sp)(BP), AX
   334  	MOVL	-4(AX), BX	// fault if CALL would, before smashing SP
   335  	MOVL	AX, SP
   336  	CALL	runtime·newstack(SB)
   337  	MOVL	$0, 0x1003	// crash if newstack returns
   338  	RET
   339  
   340  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0-0
        	// Variant of morestack that zeroes DX first; morestack stores DX
        	// into g->sched.ctxt.
   341  	MOVL	$0, DX
   342  	JMP runtime·morestack(SB)
   343  
   344  TEXT runtime·stackBarrier(SB),NOSPLIT,$0
   345  	// We came here via a RET to an overwritten return PC.
   346  	// AX may be live. Other registers are available.
        	// Clobbers BX, CX and DX; AX is left untouched.
   347  
   348  	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   349  	get_tls(CX)
   350  	MOVL	g(CX), CX	// CX = current g
   351  	MOVL	(g_stkbar+slice_array)(CX), DX	// DX = base of the g.stkbar array
   352  	MOVL	g_stkbarPos(CX), BX
   353  	IMULL	$stkbar__size, BX	// Too big for SIB.
   354  	MOVL	stkbar_savedLRVal(DX)(BX*1), BX
   355  	// Record that this stack barrier was hit.
   356  	ADDL	$1, g_stkbarPos(CX)
   357  	// Jump to the original return PC.
   358  	JMP	BX
   359  
   360  // reflectcall: call a function with the given argument list
   361  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   362  // we don't have variable-sized frames, so we use a small number
   363  // of constant-sized-frame functions to encode a few bits of size in the pc.
   364  // Caution: ugly multiline assembly macros in your future!
   365  
        // DISPATCH(NAME, MAXSIZE): with the argument size in CX, jump to NAME
        // when CX <= MAXSIZE (unsigned compare); otherwise fall through to
        // whatever follows the macro. Clobbers AX.
   366  #define DISPATCH(NAME,MAXSIZE)		\
   367  	CMPL	CX, $MAXSIZE;		\
   368  	JA	3(PC);			\
   369  	MOVL	$NAME(SB), AX;		\
   370  	JMP	AX
   371  // Note: can't just "JMP NAME(SB)" - bad inlining results.
   372  
        // reflect·call simply forwards to reflectcall.
   373  TEXT reflect·call(SB), NOSPLIT, $0-0
   374  	JMP	·reflectcall(SB)
   375  
        // Loads argsize and jumps to the first callNNN whose frame size is at
        // least argsize; sizes above 1073741824 go to badreflectcall.
   376  TEXT ·reflectcall(SB), NOSPLIT, $0-20
   377  	MOVL	argsize+12(FP), CX
   378  	DISPATCH(runtime·call16, 16)
   379  	DISPATCH(runtime·call32, 32)
   380  	DISPATCH(runtime·call64, 64)
   381  	DISPATCH(runtime·call128, 128)
   382  	DISPATCH(runtime·call256, 256)
   383  	DISPATCH(runtime·call512, 512)
   384  	DISPATCH(runtime·call1024, 1024)
   385  	DISPATCH(runtime·call2048, 2048)
   386  	DISPATCH(runtime·call4096, 4096)
   387  	DISPATCH(runtime·call8192, 8192)
   388  	DISPATCH(runtime·call16384, 16384)
   389  	DISPATCH(runtime·call32768, 32768)
   390  	DISPATCH(runtime·call65536, 65536)
   391  	DISPATCH(runtime·call131072, 131072)
   392  	DISPATCH(runtime·call262144, 262144)
   393  	DISPATCH(runtime·call524288, 524288)
   394  	DISPATCH(runtime·call1048576, 1048576)
   395  	DISPATCH(runtime·call2097152, 2097152)
   396  	DISPATCH(runtime·call4194304, 4194304)
   397  	DISPATCH(runtime·call8388608, 8388608)
   398  	DISPATCH(runtime·call16777216, 16777216)
   399  	DISPATCH(runtime·call33554432, 33554432)
   400  	DISPATCH(runtime·call67108864, 67108864)
   401  	DISPATCH(runtime·call134217728, 134217728)
   402  	DISPATCH(runtime·call268435456, 268435456)
   403  	DISPATCH(runtime·call536870912, 536870912)
   404  	DISPATCH(runtime·call1073741824, 1073741824)
   405  	MOVL	$runtime·badreflectcall(SB), AX
   406  	JMP	AX
   407  
   408  #define CALLFN(NAME,MAXSIZE)			\
   409  TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
   410  	NO_LOCAL_POINTERS;			\
   411  	/* copy arguments to stack */		\
   412  	MOVL	argptr+8(FP), SI;		\
   413  	MOVL	argsize+12(FP), CX;		\
   414  	MOVL	SP, DI;				\
   415  	REP;MOVSB;				\
   416  	/* call function */			\
   417  	MOVL	f+4(FP), DX;			\
   418  	MOVL	(DX), AX; 			\
   419  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   420  	CALL	AX;				\
   421  	/* copy return values back */		\
   422  	MOVL	argptr+8(FP), DI;		\
   423  	MOVL	argsize+12(FP), CX;		\
   424  	MOVL	retoffset+16(FP), BX;		\
   425  	MOVL	SP, SI;				\
   426  	ADDL	BX, DI;				\
   427  	ADDL	BX, SI;				\
   428  	SUBL	BX, CX;				\
   429  	REP;MOVSB;				\
   430  	/* execute write barrier updates */	\
   431  	MOVL	argtype+0(FP), DX;		\
   432  	MOVL	argptr+8(FP), DI;		\
   433  	MOVL	argsize+12(FP), CX;		\
   434  	MOVL	retoffset+16(FP), BX;		\
   435  	MOVL	DX, 0(SP);			\
   436  	MOVL	DI, 4(SP);			\
   437  	MOVL	CX, 8(SP);			\
   438  	MOVL	BX, 12(SP);			\
   439  	CALL	runtime·callwritebarrier(SB);	\
   440  	RET
   441  
        // CALLFN(NAME, MAXSIZE) defines a function with a MAXSIZE-byte frame
        // that:
        //  1. copies argsize bytes from argptr to the bottom of its frame,
        //  2. calls the code pointer stored at *f with DX = f,
        //  3. copies the bytes from retoffset up to argsize from the frame
        //     back to argptr,
        //  4. calls callwritebarrier(argtype, argptr, argsize, retoffset).
        // One instance is generated per frame size targeted by reflectcall.
   442  CALLFN(·call16, 16)
   443  CALLFN(·call32, 32)
   444  CALLFN(·call64, 64)
   445  CALLFN(·call128, 128)
   446  CALLFN(·call256, 256)
   447  CALLFN(·call512, 512)
   448  CALLFN(·call1024, 1024)
   449  CALLFN(·call2048, 2048)
   450  CALLFN(·call4096, 4096)
   451  CALLFN(·call8192, 8192)
   452  CALLFN(·call16384, 16384)
   453  CALLFN(·call32768, 32768)
   454  CALLFN(·call65536, 65536)
   455  CALLFN(·call131072, 131072)
   456  CALLFN(·call262144, 262144)
   457  CALLFN(·call524288, 524288)
   458  CALLFN(·call1048576, 1048576)
   459  CALLFN(·call2097152, 2097152)
   460  CALLFN(·call4194304, 4194304)
   461  CALLFN(·call8388608, 8388608)
   462  CALLFN(·call16777216, 16777216)
   463  CALLFN(·call33554432, 33554432)
   464  CALLFN(·call67108864, 67108864)
   465  CALLFN(·call134217728, 134217728)
   466  CALLFN(·call268435456, 268435456)
   467  CALLFN(·call536870912, 536870912)
   468  CALLFN(·call1073741824, 1073741824)
   469  
   470  // bool cas(int32 *val, int32 old, int32 new)
   471  // 32-bit compare-and-swap, performed atomically:
   472  //	if(*val == old){
   473  //		*val = new;
   474  //		return 1;
   475  //	}else
   476  //		return 0;
   477  TEXT runtime·cas(SB), NOSPLIT, $0-13
   478  	MOVL	new+8(FP), CX		// CX = replacement value
   479  	MOVL	old+4(FP), AX		// AX = expected value (CMPXCHG compares against AX)
   480  	MOVL	ptr+0(FP), BX		// BX = address of the word
   481  	LOCK
   482  	CMPXCHGL	CX, 0(BX)
   483  	SETEQ	ret+12(FP)		// ZF is set when the swap took place
   484  	RET
   485  
   486  TEXT runtime·casuintptr(SB), NOSPLIT, $0-13
        	// Same frame layout as cas ($0-13); tail-jumps to it.
   487  	JMP	runtime·cas(SB)
   488  
   489  TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-8
        	// Tail-jumps to atomicload, which is defined outside this section.
   490  	JMP	runtime·atomicload(SB)
   491  
   492  TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-8
        	// Tail-jumps to atomicload, which is defined outside this section.
   493  	JMP	runtime·atomicload(SB)
   494  
   495  TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-8
        	// Same frame layout as atomicstore ($0-8); tail-jumps to it.
   496  	JMP	runtime·atomicstore(SB)
   497  
   498  // bool runtime·cas64(uint64 *val, uint64 old, uint64 new)
   499  // Atomically:
   500  //	if(*val == old){
   501  //		*val = new;
   502  //		return 1;
   503  //	} else {
   504  //		return 0;
   505  //	}
        // old and new are passed by value as lo/hi 32-bit halves.
   506  TEXT runtime·cas64(SB), NOSPLIT, $0-21
   507  	MOVL	ptr+0(FP), BP
   508  	MOVL	old_lo+4(FP), AX	// DX:AX = expected value
   509  	MOVL	old_hi+8(FP), DX
   510  	MOVL	new_lo+12(FP), BX	// CX:BX = replacement value
   511  	MOVL	new_hi+16(FP), CX
   512  	LOCK
   513  	CMPXCHG8B	0(BP)
   514  	SETEQ	ret+20(FP)
   515  	RET
   516  
   517  // bool casp(void **p, void *old, void *new)
   518  // Pointer-sized compare-and-swap, performed atomically:
   519  //	if(*p == old){
   520  //		*p = new;
   521  //		return 1;
   522  //	}else
   523  //		return 0;
   524  TEXT runtime·casp1(SB), NOSPLIT, $0-13
   525  	MOVL	new+8(FP), CX		// CX = replacement pointer
   526  	MOVL	old+4(FP), AX		// AX = expected pointer (CMPXCHG compares against AX)
   527  	MOVL	ptr+0(FP), BX		// BX = address of the slot
   528  	LOCK
   529  	CMPXCHGL	CX, 0(BX)
   530  	SETEQ	ret+12(FP)		// ZF is set when the swap took place
   531  	RET
   532  
   533  // uint32 xadd(uint32 volatile *val, int32 delta)
   534  // Atomic add that returns the updated value:
   535  //	*val += delta;
   536  //	return *val;
   537  TEXT runtime·xadd(SB), NOSPLIT, $0-12
   538  	MOVL	delta+4(FP), CX		// CX keeps delta for the final sum
   539  	MOVL	CX, AX			// AX = delta, exchanged by XADD
   540  	MOVL	ptr+0(FP), BX
   541  	LOCK
   542  	XADDL	AX, 0(BX)		// AX = previous *val; *val += delta
   543  	ADDL	CX, AX			// previous value + delta = new value
   544  	MOVL	AX, ret+8(FP)
   545  	RET
   546  
   547  TEXT runtime·xchg(SB), NOSPLIT, $0-12
        	// Swaps new into *ptr and returns the value that was there before.
   548  	MOVL	new+4(FP), AX		// AX = value to store
   549  	MOVL	ptr+0(FP), BX		// BX = target address
   550  	XCHGL	AX, 0(BX)		// AX = previous contents of *ptr
   551  	MOVL	AX, ret+8(FP)
   552  	RET
   553  
   554  TEXT runtime·xchguintptr(SB), NOSPLIT, $0-12
        	// Same frame layout as xchg ($0-12); tail-jumps to it.
   555  	JMP	runtime·xchg(SB)
   556  
   557  TEXT runtime·procyield(SB),NOSPLIT,$0-4
        	// Executes PAUSE `cycles` times, where cycles is the single 32-bit
        	// argument. The frame declares that 4-byte argument ($0-4), and a
        	// count of zero returns immediately: without the guard the
        	// decrement would wrap and the loop would run 2^32 times.
   558  	MOVL	cycles+0(FP), AX
        	TESTL	AX, AX
        	JZ	yield_done
   559  again:
   560  	PAUSE
   561  	SUBL	$1, AX
   562  	JNZ	again
        yield_done:
   563  	RET
   564  
   565  TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-8
        	// Stores val into *ptr with XCHGL; the previous contents land in
        	// AX and are ignored.
   566  	MOVL	val+4(FP), AX		// AX = value to publish
   567  	MOVL	ptr+0(FP), BX		// BX = destination address
   568  	XCHGL	AX, 0(BX)
   569  	RET
   570  
   571  TEXT runtime·atomicstore(SB), NOSPLIT, $0-8
        	// Stores val into *ptr with XCHGL; the previous contents land in
        	// AX and are ignored.
   572  	MOVL	val+4(FP), AX		// AX = value to publish
   573  	MOVL	ptr+0(FP), BX		// BX = destination address
   574  	XCHGL	AX, 0(BX)
   575  	RET
   576  
   577  // uint64 atomicload64(uint64 volatile* addr);
        // Reads the 8 bytes at addr with one MMX MOVQ and writes them to the
        // result slot. An address that is not 8-byte aligned triggers a
        // deliberate load from address 0.
   578  TEXT runtime·atomicload64(SB), NOSPLIT, $0-12
   579  	MOVL	ptr+0(FP), AX
   580  	TESTL	$7, AX
   581  	JZ	load64_aligned
   582  	MOVL	0, AX // crash with nil ptr deref
   583  load64_aligned:
   584  	LEAL	ret_lo+4(FP), BX	// BX = address of the return value
   585  	// MOVQ (%EAX), %MM0
   586  	BYTE $0x0f
   587  	BYTE $0x6f
   588  	BYTE $0x00
   589  	// MOVQ %MM0, 0(%EBX)
   590  	BYTE $0x0f
        	BYTE $0x7f
        	BYTE $0x03
        	// EMMS
        	BYTE $0x0F
        	BYTE $0x77
        	RET
   591  
   592  // void runtime·atomicstore64(uint64 volatile* addr, uint64 v);
        // Writes v to addr with one MMX MOVQ, then executes a locked XADDL of
        // zero on the stack as a fence. An address that is not 8-byte aligned
        // triggers a deliberate load from address 0.
   593  TEXT runtime·atomicstore64(SB), NOSPLIT, $0-12
   594  	MOVL	ptr+0(FP), AX
   595  	TESTL	$7, AX
   596  	JZ	2(PC)	// aligned: skip the crashing load
   597  	MOVL	0, AX // crash with nil ptr deref
   598  	// MOVQ and EMMS were introduced on the Pentium MMX.
   599  	// MOVQ 0x8(%ESP), %MM0
   600  	BYTE $0x0f; BYTE $0x6f; BYTE $0x44; BYTE $0x24; BYTE $0x08
   601  	// MOVQ %MM0, (%EAX)
   602  	BYTE $0x0f; BYTE $0x7f; BYTE $0x00 
   603  	// EMMS
   604  	BYTE $0x0F; BYTE $0x77
   605  	// This is essentially a no-op, but it provides required memory fencing.
   606  	// It can be replaced with MFENCE, but MFENCE was introduced only on the Pentium4 (SSE2).
   607  	MOVL	$0, AX
   608  	LOCK
   609  	XADDL	AX, (SP)	// adds zero to the word at the top of the stack
   610  	RET
   611  
   612  // void	runtime·atomicor8(byte volatile*, byte);
        // Atomically ORs val into the byte at ptr.
   613  TEXT runtime·atomicor8(SB), NOSPLIT, $0-5
   614  	MOVB	val+4(FP), BX		// BX (low byte) = mask
   615  	MOVL	ptr+0(FP), AX		// AX = byte address
   616  	LOCK
   617  	ORB	BX, (AX)
   618  	RET
   619  
   620  // void	runtime·atomicand8(byte volatile*, byte);
        // Atomically ANDs val into the byte at ptr.
   621  TEXT runtime·atomicand8(SB), NOSPLIT, $0-5
   622  	MOVB	val+4(FP), BX		// BX (low byte) = mask
   623  	MOVL	ptr+0(FP), AX		// AX = byte address
   624  	LOCK
   625  	ANDB	BX, (AX)
   626  	RET
   627  
   628  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   629  	// Stores are already ordered on x86, so this is just a
   630  	// compile barrier.
        	// No instruction other than RET is emitted.
   631  	RET
   632  
   633  // void jmpdefer(fn, sp);
   634  // called from deferreturn.
   635  // 1. pop the caller
   636  // 2. sub 5 bytes from the callers return
   637  // 3. jmp to the argument
        // fn is entered with DX pointing at the closure (fv) and with SP set
        // to argp-4, the slot holding the adjusted return address.
   638  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
   639  	MOVL	fv+0(FP), DX	// fn
   640  	MOVL	argp+4(FP), BX	// caller sp
   641  	LEAL	-4(BX), SP	// caller sp after CALL
   642  	SUBL	$5, (SP)	// return to CALL again
   643  	MOVL	0(DX), BX	// BX = code pointer stored in the closure
   644  	JMP	BX	// but first run the deferred function
   645  
   646  // Save state of caller into g->sched.
        // Stores the caller's SP and return PC in g->sched and zeroes
        // sched.ret and sched.ctxt. AX and BX are saved and restored around
        // the body; no other register is written.
   647  TEXT gosave<>(SB),NOSPLIT,$0
   648  	PUSHL	AX
   649  	PUSHL	BX
   650  	get_tls(BX)
   651  	MOVL	g(BX), BX	// BX = current g
   652  	LEAL	arg+0(FP), AX	// AX = caller's SP (address just above the return PC)
   653  	MOVL	AX, (g_sched+gobuf_sp)(BX)
   654  	MOVL	-4(AX), AX	// AX = return PC into the caller
   655  	MOVL	AX, (g_sched+gobuf_pc)(BX)
   656  	MOVL	$0, (g_sched+gobuf_ret)(BX)
   657  	MOVL	$0, (g_sched+gobuf_ctxt)(BX)
   658  	POPL	BX
   659  	POPL	AX
   660  	RET
   661  
   662  // func asmcgocall(fn, arg unsafe.Pointer) int32
   663  // Runs fn(arg) on the scheduler (m->g0) stack with SP aligned to 16
   664  // bytes for the gcc ABI, then returns fn's AX result.
   665  // See cgocall.go for more details.
   666  TEXT ·asmcgocall(SB),NOSPLIT,$0-12
   667  	MOVL	arg+4(FP), BX		// BX = argument handed to fn
   668  	MOVL	fn+0(FP), AX		// AX = function to call
   669  	MOVL	SP, DX			// DX = SP on entry, used for the depth below
   670  
   671  	// Decide whether a switch to the m->g0 stack is needed.
   672  	// New OS threads are created through this path as well, and
   673  	// those already arrive on the m->g0 stack.
   674  	get_tls(CX)
   675  	MOVL	g(CX), DI		// DI = current g
   676  	MOVL	g_m(DI), BP		// BP = g->m
   677  	MOVL	m_g0(BP), SI		// SI = m->g0
   678  	CMPL	SI, DI
   679  	JEQ	asmcgocall_ong0
   680  	CALL	gosave<>(SB)
   681  	MOVL	SI, g(CX)
   682  	MOVL	(g_sched+gobuf_sp)(SI), SP
   683  
   684  asmcgocall_ong0:
   685  	// Now on a scheduling stack (a pthread-created stack).
   686  	SUBL	$32, SP
   687  	ANDL	$~15, SP	// alignment, perhaps unnecessary
   688  	MOVL	DI, 8(SP)	// save g
   689  	MOVL	(g_stack+stack_hi)(DI), DI
   690  	SUBL	DX, DI
   691  	MOVL	DI, 4(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   692  	MOVL	BX, 0(SP)	// first argument in x86-32 ABI
   693  	CALL	AX
   694  
   695  	// Bring back g and the stack pointer; SP is recomputed as
   696  	// g->stack.hi minus the saved depth.
   697  	get_tls(CX)
   698  	MOVL	8(SP), DI
   699  	MOVL	(g_stack+stack_hi)(DI), SI
   700  	SUBL	4(SP), SI
   701  	MOVL	DI, g(CX)
   702  	MOVL	SI, SP
   703  
   704  	MOVL	AX, ret+8(FP)
   705  	RET
   706  
   707  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
   708  // Turn the fn into a Go func (by taking its address) and call
   709  // cgocallback_gofunc.
   710  TEXT runtime·cgocallback(SB),NOSPLIT,$12-12
   711  	LEAL	fn+0(FP), AX	// address of the fn argument slot
   712  	MOVL	AX, 0(SP)
   713  	MOVL	frame+4(FP), AX
   714  	MOVL	AX, 4(SP)
   715  	MOVL	framesize+8(FP), AX
   716  	MOVL	AX, 8(SP)
   717  	MOVL	$runtime·cgocallback_gofunc(SB), AX	// indirect call through AX
   718  	CALL	AX
   719  	RET
   720  
   721  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
   722  // See cgocall.go for more details.
        // Outline: obtain an m (via needm when g is nil), save g0's sched.sp
        // at 0(SP), switch to m->curg's stack, call cgocallbackg, restore
        // curg's sched.pc/sp, switch back to g0, and call dropm when the m
        // was borrowed. DX carries the "old m" value: zero on the needm path.
   723  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-12
   724  	NO_LOCAL_POINTERS
   725  
   726  	// If g is nil, Go did not create the current thread.
   727  	// Call needm to obtain one for temporary use.
   728  	// In this case, we're running on the thread stack, so there's
   729  	// lots of space, but the linker doesn't know. Hide the call from
   730  	// the linker analysis by using an indirect call through AX.
   731  	get_tls(CX)
   732  #ifdef GOOS_windows
   733  	MOVL	$0, BP
   734  	CMPL	CX, $0
   735  	JEQ	2(PC) // TODO
   736  #endif
   737  	MOVL	g(CX), BP
   738  	CMPL	BP, $0
   739  	JEQ	needm
   740  	MOVL	g_m(BP), BP
   741  	MOVL	BP, DX // saved copy of oldm
   742  	JMP	havem
   743  needm:
   744  	MOVL	$0, 0(SP)
   745  	MOVL	$runtime·needm(SB), AX
   746  	CALL	AX
   747  	MOVL	0(SP), DX	// DX = word at 0(SP), which was zeroed before the call
   748  	get_tls(CX)
   749  	MOVL	g(CX), BP
   750  	MOVL	g_m(BP), BP
   751  
   752  	// Set m->sched.sp = SP, so that if a panic happens
   753  	// during the function we are about to execute, it will
   754  	// have a valid SP to run on the g0 stack.
   755  	// The next few lines (after the havem label)
   756  	// will save this SP onto the stack and then write
   757  	// the same SP back to m->sched.sp. That seems redundant,
   758  	// but if an unrecovered panic happens, unwindm will
   759  	// restore the g->sched.sp from the stack location
   760  	// and then systemstack will try to use it. If we don't set it here,
   761  	// that restored SP will be uninitialized (typically 0) and
   762  	// will not be usable.
   763  	MOVL	m_g0(BP), SI
   764  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   765  
   766  havem:
   767  	// Now there's a valid m, and we're running on its m->g0.
   768  	// Save current m->g0->sched.sp on stack and then set it to SP.
   769  	// Save current sp in m->g0->sched.sp in preparation for
   770  	// switch back to m->curg stack.
   771  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   772  	MOVL	m_g0(BP), SI
   773  	MOVL	(g_sched+gobuf_sp)(SI), AX
   774  	MOVL	AX, 0(SP)
   775  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   776  
   777  	// Switch to m->curg stack and call runtime.cgocallbackg.
   778  	// Because we are taking over the execution of m->curg
   779  	// but *not* resuming what had been running, we need to
   780  	// save that information (m->curg->sched) so we can restore it.
   781  	// We can restore m->curg->sched.sp easily, because calling
   782  	// runtime.cgocallbackg leaves SP unchanged upon return.
   783  	// To save m->curg->sched.pc, we push it onto the stack.
   784  	// This has the added benefit that it looks to the traceback
   785  	// routine like cgocallbackg is going to return to that
   786  	// PC (because the frame we allocate below has the same
   787  	// size as cgocallback_gofunc's frame declared above)
   788  	// so that the traceback will seamlessly trace back into
   789  	// the earlier calls.
   790  	//
   791  	// In the new goroutine, 0(SP) holds the saved oldm (DX) register.
   792  	// 4(SP) and 8(SP) are unused.
   793  	MOVL	m_curg(BP), SI
   794  	MOVL	SI, g(CX)
   795  	MOVL	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
   796  	MOVL	(g_sched+gobuf_pc)(SI), BP
   797  	MOVL	BP, -4(DI)	// saved sched.pc sits just below curg's saved sp
   798  	LEAL	-(4+12)(DI), SP	// 4 bytes for the saved pc + 12-byte frame
   799  	MOVL	DX, 0(SP)
   800  	CALL	runtime·cgocallbackg(SB)
   801  	MOVL	0(SP), DX
   802  
   803  	// Restore g->sched (== m->curg->sched) from saved values.
   804  	get_tls(CX)
   805  	MOVL	g(CX), SI
   806  	MOVL	12(SP), BP	// the pc stored at -4(DI) above
   807  	MOVL	BP, (g_sched+gobuf_pc)(SI)
   808  	LEAL	(12+4)(SP), DI	// the sp value that was in DI above
   809  	MOVL	DI, (g_sched+gobuf_sp)(SI)
   810  
   811  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   812  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   813  	// so we do not have to restore it.)
   814  	MOVL	g(CX), BP
   815  	MOVL	g_m(BP), BP
   816  	MOVL	m_g0(BP), SI
   817  	MOVL	SI, g(CX)
   818  	MOVL	(g_sched+gobuf_sp)(SI), SP
   819  	MOVL	0(SP), AX
   820  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   821  	
   822  	// If the m on entry was nil, we called needm above to borrow an m
   823  	// for the duration of the call. Since the call is over, return it with dropm.
   824  	CMPL	DX, $0
   825  	JNE 3(PC)	// DX != 0: skip the two-instruction dropm call
   826  	MOVL	$runtime·dropm(SB), AX
   827  	CALL	AX
   828  
   829  	// Done!
   830  	RET
   831  
   832  // void setg(G*); set g. for use by needm.
        // On Windows builds the word at 0x14(FS) is updated first: it is
        // cleared (and the function returns) when gg is nil, otherwise it is
        // pointed at gg->m->tls. Then gg is stored in the TLS g slot.
   833  TEXT runtime·setg(SB), NOSPLIT, $0-4
   834  	MOVL	gg+0(FP), BX
   835  #ifdef GOOS_windows
   836  	CMPL	BX, $0
   837  	JNE	settls
   838  	MOVL	$0, 0x14(FS)
   839  	RET
   840  settls:
   841  	MOVL	g_m(BX), AX
   842  	LEAL	m_tls(AX), AX	// AX = address of the tls field inside m
   843  	MOVL	AX, 0x14(FS)
   844  #endif
   845  	get_tls(CX)
   846  	MOVL	BX, g(CX)
   847  	RET
   848  
   849  // void setg_gcc(G*); set g. for use by gcc
        // Stores the argument in the TLS g slot. Writes only AX and DX.
        // rt0_go passes this function's address to _cgo_init.
   850  TEXT setg_gcc<>(SB), NOSPLIT, $0
   851  	get_tls(AX)
   852  	MOVL	gg+0(FP), DX
   853  	MOVL	DX, g(AX)
   854  	RET
   855  
   856  // Verify that SP lies inside the current g's stack: both compares are
        // strict, so the function traps with INT $3 unless
        // g->stack.lo < SP < g->stack.hi.
   857  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   858  	get_tls(CX)
   859  	MOVL	g(CX), AX		// AX = current g
   860  	CMPL	(g_stack+stack_hi)(AX), SP
   861  	JHI	stackcheck_below_hi	// stack.hi > SP: upper bound holds
   862  	INT	$3
   863  stackcheck_below_hi:
   864  	CMPL	SP, (g_stack+stack_lo)(AX)
   865  	JHI	stackcheck_above_lo	// SP > stack.lo: lower bound holds
   866  	INT	$3
        stackcheck_above_lo:
        	RET
   867  
   868  TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8
        	// Returns the word stored just below the address passed in argp.
        	// If that word equals runtime·stackBarrierPC, the result of
        	// nextBarrierPC is returned instead.
   869  	MOVL	argp+0(FP),AX		// addr of first arg
   870  	MOVL	-4(AX),AX		// get calling pc
   871  	CMPL	AX, runtime·stackBarrierPC(SB)
   872  	JNE	nobar
   873  	// Get original return PC.
   874  	CALL	runtime·nextBarrierPC(SB)
   875  	MOVL	0(SP), AX	// nextBarrierPC's result is read from the 4-byte frame
   876  nobar:
   877  	MOVL	AX, ret+4(FP)
   878  	RET
   879  
   880  TEXT runtime·setcallerpc(SB),NOSPLIT,$4-8
        	// Overwrites the word stored just below argp with pc. If that word
        	// currently equals runtime·stackBarrierPC it is left alone and pc
        	// is handed to setNextBarrierPC instead.
   881  	MOVL	argp+0(FP),AX		// addr of first arg
   882  	MOVL	pc+4(FP), BX
   883  	MOVL	-4(AX), CX	// CX = return PC currently stored there
   884  	CMPL	CX, runtime·stackBarrierPC(SB)
   885  	JEQ	setbar
   886  	MOVL	BX, -4(AX)		// set calling pc
   887  	RET
   888  setbar:
   889  	// Set the stack barrier return PC.
   890  	MOVL	BX, 0(SP)	// argument for setNextBarrierPC
   891  	CALL	runtime·setNextBarrierPC(SB)
   892  	RET
   893  
   894  TEXT runtime·getcallersp(SB), NOSPLIT, $0-8
        	// Returns its argp argument unchanged.
   895  	MOVL	argp+0(FP), AX
   896  	MOVL	AX, ret+4(FP)
   897  	RET
   898  
   899  // func cputicks() int64
        // Returns the RDTSC counter. When bit 0x4000000 of cpuid_edx is set,
        // a fence is issued first: LFENCE if lfenceBeforeRdtsc is 1, MFENCE
        // otherwise.
   900  TEXT runtime·cputicks(SB),NOSPLIT,$0-8
   901  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // no sse2, no mfence
   902  	JEQ	cputicks_read
   903  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   904  	JEQ	cputicks_lfence
   905  	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
   906  	JMP	cputicks_read
   907  cputicks_lfence:
   908  	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
   909  cputicks_read:
   910  	RDTSC
   911  	MOVL	AX, ret_lo+0(FP)	// low 32 bits of the counter
   912  	MOVL	DX, ret_hi+4(FP)	// high 32 bits of the counter
   913  	RET
   914  
   915  TEXT runtime·ldt0setup(SB),NOSPLIT,$16-0
   916  	// set up ldt 7 to point at tls0
   917  	// ldt 1 would be fine on Linux, but on OS X, 7 is as low as we can go.
   918  	// the entry number is just a hint.  setldt will set up GS with what it used.
        	// Calls setldt(7, &runtime·tls0, 32).
   919  	MOVL	$7, 0(SP)
   920  	LEAL	runtime·tls0(SB), AX
   921  	MOVL	AX, 4(SP)
   922  	MOVL	$32, 8(SP)	// sizeof(tls array)
   923  	CALL	runtime·setldt(SB)
   924  	RET
   925  
   926  TEXT runtime·emptyfunc(SB),0,$0-0
        	// Declared with flags 0, i.e. without NOSPLIT, unlike its
        	// neighbours. rt0_go calls it to "fault if stack check is wrong".
   927  	RET
   928  
   929  TEXT runtime·abort(SB),NOSPLIT,$0-0
        	// Stops the program with a breakpoint trap. If execution is
        	// resumed after the trap, spin here rather than run off the end
        	// of this function into the code that follows it.
   930  	INT $0x3
        abort_loop:
        	JMP	abort_loop
   931  
   932  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   933  // redirects to memhash(p, h, size) using the size
   934  // stored in the closure.
        // DX must point at the closure on entry; the size is the word at 4(DX).
   935  TEXT runtime·memhash_varlen(SB),NOSPLIT,$16-12
   936  	GO_ARGS
   937  	NO_LOCAL_POINTERS
   938  	MOVL	p+0(FP), AX
   939  	MOVL	h+4(FP), BX
   940  	MOVL	4(DX), CX	// CX = size taken from the closure
   941  	MOVL	AX, 0(SP)
   942  	MOVL	BX, 4(SP)
   943  	MOVL	CX, 8(SP)
   944  	CALL	runtime·memhash(SB)
   945  	MOVL	12(SP), AX	// memhash's result slot follows its three arguments
   946  	MOVL	AX, ret+8(FP)
   947  	RET
   948  
   949  // hash function using AES hardware instructions
        // Loads the register arguments expected by aeshashbody (AX = data,
        // CX = length, DX = address of the result slot) and tail-jumps to it.
   950  TEXT runtime·aeshash(SB),NOSPLIT,$0-16
   951  	MOVL	p+0(FP), AX	// ptr to data
   952  	MOVL	s+8(FP), CX	// size
   953  	LEAL	ret+12(FP), DX
   954  	JMP	runtime·aeshashbody(SB)
   955  
   956  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
        	// String variant of aeshash: p points at a string header whose
        	// first word is the data pointer and second word the length.
        	// Sets AX/CX/DX for aeshashbody and tail-jumps to it.
   957  	MOVL	p+0(FP), AX	// ptr to string object
   958  	MOVL	4(AX), CX	// length of string
   959  	MOVL	(AX), AX	// string data
   960  	LEAL	ret+8(FP), DX
   961  	JMP	runtime·aeshashbody(SB)
   962  
   963  // aeshashbody is the common tail that aeshash and aeshashstr JMP to.
   964  // AX: data
   965  // CX: length
        // DX: address to put return value
        // The seed is read from h+4(FP), i.e. from the argument frame of whichever
        // entry point jumped here. Only the low 32 bits of the final state are
        // stored through DX.
   966  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   967  	MOVL	h+4(FP), X0	            // 32 bits of per-table hash seed
   968  	PINSRW	$4, CX, X0	            // 16 bits of length
   969  	PSHUFHW	$0, X0, X0	            // replace size with its low 2 bytes repeated 4 times
   970  	MOVO	X0, X1                      // save unscrambled seed
   971  	PXOR	runtime·aeskeysched(SB), X0 // xor in per-process seed
   972  	AESENC	X0, X0                      // scramble seed
   973  
        	// Dispatch on length: 0-15, exactly 16, 17-32, 33-64, 65 and up.
   974  	CMPL	CX, $16
   975  	JB	aes0to15
   976  	JE	aes16
   977  	CMPL	CX, $32
   978  	JBE	aes17to32
   979  	CMPL	CX, $64
   980  	JBE	aes33to64
   981  	JMP	aes65plus
   982  	
   983  aes0to15:
   984  	TESTL	CX, CX
   985  	JE	aes0
   986  
   987  	ADDL	$16, AX	// AX = data+16
   988  	TESTW	$0xff0, AX	// bits 4-11 of data+16 all zero <=> data starts in the last 16 bytes of a 4K page
   989  	JE	endofpage
   990  
   991  	// 16 bytes loaded at this address won't cross
   992  	// a page boundary, so we can load it directly.
   993  	MOVOU	-16(AX), X1	// -16 undoes the ADDL above: load from the original data pointer
   994  	ADDL	CX, CX	// 2*length; with the *8 scale below this indexes 16-byte table entries
   995  	PAND	masks<>(SB)(CX*8), X1	// keep only the first length bytes
   996  
   997  final1:	
   998  	AESENC	X0, X1  // scramble input, xor in seed
   999  	AESENC	X1, X1  // scramble combo 2 times
  1000  	AESENC	X1, X1
  1001  	MOVL	X1, (DX)
  1002  	RET
  1003  
  1004  endofpage:
  1005  	// low 12 address bits are 1111 1111 xxxx.  Might be up against
  1006  	// a page boundary, so load ending at last byte.
  1007  	// Then shift bytes down using pshufb.
  1008  	MOVOU	-32(AX)(CX*1), X1	// AX is data+16 here, so this is the 16 bytes ending at data+length
  1009  	ADDL	CX, CX	// 2*length; with the *8 scale below this indexes 16-byte table entries
  1010  	PSHUFB	shifts<>(SB)(CX*8), X1
  1011  	JMP	final1
  1012  
  1013  aes0:
  1014  	// Return scrambled input seed
  1015  	AESENC	X0, X0
  1016  	MOVL	X0, (DX)
  1017  	RET
  1018  
  1019  aes16:
  1020  	MOVOU	(AX), X1
  1021  	JMP	final1
  1022  
  1023  aes17to32:
  1024  	// make second starting seed
  1025  	PXOR	runtime·aeskeysched+16(SB), X1
  1026  	AESENC	X1, X1
  1027  	
  1028  	// load data to be hashed
  1029  	MOVOU	(AX), X2	// first 16 bytes
  1030  	MOVOU	-16(AX)(CX*1), X3	// last 16 bytes (overlaps the first load when length < 32)
  1031  
  1032  	// scramble 3 times
  1033  	AESENC	X0, X2
  1034  	AESENC	X1, X3
  1035  	AESENC	X2, X2
  1036  	AESENC	X3, X3
  1037  	AESENC	X2, X2
  1038  	AESENC	X3, X3
  1039  
  1040  	// combine results
  1041  	PXOR	X3, X2
  1042  	MOVL	X2, (DX)
  1043  	RET
  1044  
  1045  aes33to64:
  1046  	// make 3 more starting seeds
  1047  	MOVO	X1, X2
  1048  	MOVO	X1, X3
  1049  	PXOR	runtime·aeskeysched+16(SB), X1
  1050  	PXOR	runtime·aeskeysched+32(SB), X2
  1051  	PXOR	runtime·aeskeysched+48(SB), X3
  1052  	AESENC	X1, X1
  1053  	AESENC	X2, X2
  1054  	AESENC	X3, X3
  1055  	
        	// first 32 and last 32 bytes (the two halves overlap when length < 64)
  1056  	MOVOU	(AX), X4
  1057  	MOVOU	16(AX), X5
  1058  	MOVOU	-32(AX)(CX*1), X6
  1059  	MOVOU	-16(AX)(CX*1), X7
  1060  	
  1061  	AESENC	X0, X4
  1062  	AESENC	X1, X5
  1063  	AESENC	X2, X6
  1064  	AESENC	X3, X7
  1065  	
  1066  	AESENC	X4, X4
  1067  	AESENC	X5, X5
  1068  	AESENC	X6, X6
  1069  	AESENC	X7, X7
  1070  	
  1071  	AESENC	X4, X4
  1072  	AESENC	X5, X5
  1073  	AESENC	X6, X6
  1074  	AESENC	X7, X7
  1075  
        	// fold the four lanes into X4
  1076  	PXOR	X6, X4
  1077  	PXOR	X7, X5
  1078  	PXOR	X5, X4
  1079  	MOVL	X4, (DX)
  1080  	RET
  1081  
  1082  aes65plus:
  1083  	// make 3 more starting seeds
  1084  	MOVO	X1, X2
  1085  	MOVO	X1, X3
  1086  	PXOR	runtime·aeskeysched+16(SB), X1
  1087  	PXOR	runtime·aeskeysched+32(SB), X2
  1088  	PXOR	runtime·aeskeysched+48(SB), X3
  1089  	AESENC	X1, X1
  1090  	AESENC	X2, X2
  1091  	AESENC	X3, X3
  1092  	
  1093  	// start with last (possibly overlapping) block
  1094  	MOVOU	-64(AX)(CX*1), X4
  1095  	MOVOU	-48(AX)(CX*1), X5
  1096  	MOVOU	-32(AX)(CX*1), X6
  1097  	MOVOU	-16(AX)(CX*1), X7
  1098  
  1099  	// scramble state once
  1100  	AESENC	X0, X4
  1101  	AESENC	X1, X5
  1102  	AESENC	X2, X6
  1103  	AESENC	X3, X7
  1104  
  1105  	// compute number of remaining 64-byte blocks
  1106  	DECL	CX
  1107  	SHRL	$6, CX	// CX = (length-1)/64, at least 1 because length >= 65 here
  1108  	
  1109  aesloop:
  1110  	// scramble state, xor in a block
  1111  	MOVOU	(AX), X0	// X0-X3 are reused as block buffers; the seeds are no longer needed
  1112  	MOVOU	16(AX), X1
  1113  	MOVOU	32(AX), X2
  1114  	MOVOU	48(AX), X3
  1115  	AESENC	X0, X4
  1116  	AESENC	X1, X5
  1117  	AESENC	X2, X6
  1118  	AESENC	X3, X7
  1119  
  1120  	// scramble state
  1121  	AESENC	X4, X4
  1122  	AESENC	X5, X5
  1123  	AESENC	X6, X6
  1124  	AESENC	X7, X7
  1125  
  1126  	ADDL	$64, AX
  1127  	DECL	CX
  1128  	JNE	aesloop
  1129  
  1130  	// 2 more scrambles to finish
  1131  	AESENC	X4, X4
  1132  	AESENC	X5, X5
  1133  	AESENC	X6, X6
  1134  	AESENC	X7, X7
  1135  	
  1136  	AESENC	X4, X4
  1137  	AESENC	X5, X5
  1138  	AESENC	X6, X6
  1139  	AESENC	X7, X7
  1140  
        	// fold the four lanes into X4
  1141  	PXOR	X6, X4
  1142  	PXOR	X7, X5
  1143  	PXOR	X5, X4
  1144  	MOVL	X4, (DX)
  1145  	RET
  1146  
  1147  TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
        	// Hash the 4 bytes at p with seed h: seed and data are packed into X0,
        	// run through three AESENC rounds keyed from aeskeysched, and the
        	// low 32 bits of the result are stored to ret.
  1148  	MOVL	p+0(FP), AX	// ptr to data
  1149  	MOVL	h+4(FP), X0	// seed
  1150  	PINSRD	$1, (AX), X0	// data
  1151  	AESENC	runtime·aeskeysched+0(SB), X0
  1152  	AESENC	runtime·aeskeysched+16(SB), X0
  1153  	AESENC	runtime·aeskeysched+32(SB), X0
  1154  	MOVL	X0, ret+8(FP)
  1155  	RET
  1156  
  1157  TEXT runtime·aeshash64(SB),NOSPLIT,$0-12
        	// Hash the 8 bytes at p with seed h: data goes in the low 64 bits of X0,
        	// the seed in dword 2, followed by three AESENC rounds keyed from
        	// aeskeysched. The low 32 bits of the result are stored to ret.
  1158  	MOVL	p+0(FP), AX	// ptr to data
  1159  	MOVQ	(AX), X0	// data
  1160  	PINSRD	$2, h+4(FP), X0	// seed
  1161  	AESENC	runtime·aeskeysched+0(SB), X0
  1162  	AESENC	runtime·aeskeysched+16(SB), X0
  1163  	AESENC	runtime·aeskeysched+32(SB), X0
  1164  	MOVL	X0, ret+8(FP)
  1165  	RET
  1166  
  1167  // masks<> holds 16 masks of 16 bytes each: entry n (at byte offset 16*n) has
        // its low n bytes set to 0xff and the remaining bytes zero. aeshashbody ANDs
        // a 16-byte load with entry `length` to get rid of data in the high part of
        // the register.
  1168  DATA masks<>+0x00(SB)/4, $0x00000000
  1169  DATA masks<>+0x04(SB)/4, $0x00000000
  1170  DATA masks<>+0x08(SB)/4, $0x00000000
  1171  DATA masks<>+0x0c(SB)/4, $0x00000000
  1172  	
  1173  DATA masks<>+0x10(SB)/4, $0x000000ff
  1174  DATA masks<>+0x14(SB)/4, $0x00000000
  1175  DATA masks<>+0x18(SB)/4, $0x00000000
  1176  DATA masks<>+0x1c(SB)/4, $0x00000000
  1177  	
  1178  DATA masks<>+0x20(SB)/4, $0x0000ffff
  1179  DATA masks<>+0x24(SB)/4, $0x00000000
  1180  DATA masks<>+0x28(SB)/4, $0x00000000
  1181  DATA masks<>+0x2c(SB)/4, $0x00000000
  1182  	
  1183  DATA masks<>+0x30(SB)/4, $0x00ffffff
  1184  DATA masks<>+0x34(SB)/4, $0x00000000
  1185  DATA masks<>+0x38(SB)/4, $0x00000000
  1186  DATA masks<>+0x3c(SB)/4, $0x00000000
  1187  	
  1188  DATA masks<>+0x40(SB)/4, $0xffffffff
  1189  DATA masks<>+0x44(SB)/4, $0x00000000
  1190  DATA masks<>+0x48(SB)/4, $0x00000000
  1191  DATA masks<>+0x4c(SB)/4, $0x00000000
  1192  	
  1193  DATA masks<>+0x50(SB)/4, $0xffffffff
  1194  DATA masks<>+0x54(SB)/4, $0x000000ff
  1195  DATA masks<>+0x58(SB)/4, $0x00000000
  1196  DATA masks<>+0x5c(SB)/4, $0x00000000
  1197  	
  1198  DATA masks<>+0x60(SB)/4, $0xffffffff
  1199  DATA masks<>+0x64(SB)/4, $0x0000ffff
  1200  DATA masks<>+0x68(SB)/4, $0x00000000
  1201  DATA masks<>+0x6c(SB)/4, $0x00000000
  1202  	
  1203  DATA masks<>+0x70(SB)/4, $0xffffffff
  1204  DATA masks<>+0x74(SB)/4, $0x00ffffff
  1205  DATA masks<>+0x78(SB)/4, $0x00000000
  1206  DATA masks<>+0x7c(SB)/4, $0x00000000
  1207  	
  1208  DATA masks<>+0x80(SB)/4, $0xffffffff
  1209  DATA masks<>+0x84(SB)/4, $0xffffffff
  1210  DATA masks<>+0x88(SB)/4, $0x00000000
  1211  DATA masks<>+0x8c(SB)/4, $0x00000000
  1212  	
  1213  DATA masks<>+0x90(SB)/4, $0xffffffff
  1214  DATA masks<>+0x94(SB)/4, $0xffffffff
  1215  DATA masks<>+0x98(SB)/4, $0x000000ff
  1216  DATA masks<>+0x9c(SB)/4, $0x00000000
  1217  	
  1218  DATA masks<>+0xa0(SB)/4, $0xffffffff
  1219  DATA masks<>+0xa4(SB)/4, $0xffffffff
  1220  DATA masks<>+0xa8(SB)/4, $0x0000ffff
  1221  DATA masks<>+0xac(SB)/4, $0x00000000
  1222  	
  1223  DATA masks<>+0xb0(SB)/4, $0xffffffff
  1224  DATA masks<>+0xb4(SB)/4, $0xffffffff
  1225  DATA masks<>+0xb8(SB)/4, $0x00ffffff
  1226  DATA masks<>+0xbc(SB)/4, $0x00000000
  1227  	
  1228  DATA masks<>+0xc0(SB)/4, $0xffffffff
  1229  DATA masks<>+0xc4(SB)/4, $0xffffffff
  1230  DATA masks<>+0xc8(SB)/4, $0xffffffff
  1231  DATA masks<>+0xcc(SB)/4, $0x00000000
  1232  	
  1233  DATA masks<>+0xd0(SB)/4, $0xffffffff
  1234  DATA masks<>+0xd4(SB)/4, $0xffffffff
  1235  DATA masks<>+0xd8(SB)/4, $0xffffffff
  1236  DATA masks<>+0xdc(SB)/4, $0x000000ff
  1237  	
  1238  DATA masks<>+0xe0(SB)/4, $0xffffffff
  1239  DATA masks<>+0xe4(SB)/4, $0xffffffff
  1240  DATA masks<>+0xe8(SB)/4, $0xffffffff
  1241  DATA masks<>+0xec(SB)/4, $0x0000ffff
  1242  	
  1243  DATA masks<>+0xf0(SB)/4, $0xffffffff
  1244  DATA masks<>+0xf4(SB)/4, $0xffffffff
  1245  DATA masks<>+0xf8(SB)/4, $0xffffffff
  1246  DATA masks<>+0xfc(SB)/4, $0x00ffffff
  1247  
  1248  GLOBL masks<>(SB),RODATA,$256
  1249  
  1250  // these are arguments to pshufb.  They move data down from
  1251  // the high bytes of the register to the low bytes of the register.
  1252  // index is how many bytes to move.
        // Entry n sits at byte offset 16*n. For n >= 1 its first n bytes select
        // source bytes 16-n..15 and every remaining byte is 0xff, which makes
        // pshufb write zero there. Entry 0 is all zero bytes; aeshashbody never
        // indexes it because it handles length 0 separately.
  1253  DATA shifts<>+0x00(SB)/4, $0x00000000
  1254  DATA shifts<>+0x04(SB)/4, $0x00000000
  1255  DATA shifts<>+0x08(SB)/4, $0x00000000
  1256  DATA shifts<>+0x0c(SB)/4, $0x00000000
  1257  	
  1258  DATA shifts<>+0x10(SB)/4, $0xffffff0f
  1259  DATA shifts<>+0x14(SB)/4, $0xffffffff
  1260  DATA shifts<>+0x18(SB)/4, $0xffffffff
  1261  DATA shifts<>+0x1c(SB)/4, $0xffffffff
  1262  	
  1263  DATA shifts<>+0x20(SB)/4, $0xffff0f0e
  1264  DATA shifts<>+0x24(SB)/4, $0xffffffff
  1265  DATA shifts<>+0x28(SB)/4, $0xffffffff
  1266  DATA shifts<>+0x2c(SB)/4, $0xffffffff
  1267  	
  1268  DATA shifts<>+0x30(SB)/4, $0xff0f0e0d
  1269  DATA shifts<>+0x34(SB)/4, $0xffffffff
  1270  DATA shifts<>+0x38(SB)/4, $0xffffffff
  1271  DATA shifts<>+0x3c(SB)/4, $0xffffffff
  1272  	
  1273  DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c
  1274  DATA shifts<>+0x44(SB)/4, $0xffffffff
  1275  DATA shifts<>+0x48(SB)/4, $0xffffffff
  1276  DATA shifts<>+0x4c(SB)/4, $0xffffffff
  1277  	
  1278  DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b
  1279  DATA shifts<>+0x54(SB)/4, $0xffffff0f
  1280  DATA shifts<>+0x58(SB)/4, $0xffffffff
  1281  DATA shifts<>+0x5c(SB)/4, $0xffffffff
  1282  	
  1283  DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a
  1284  DATA shifts<>+0x64(SB)/4, $0xffff0f0e
  1285  DATA shifts<>+0x68(SB)/4, $0xffffffff
  1286  DATA shifts<>+0x6c(SB)/4, $0xffffffff
  1287  	
  1288  DATA shifts<>+0x70(SB)/4, $0x0c0b0a09
  1289  DATA shifts<>+0x74(SB)/4, $0xff0f0e0d
  1290  DATA shifts<>+0x78(SB)/4, $0xffffffff
  1291  DATA shifts<>+0x7c(SB)/4, $0xffffffff
  1292  	
  1293  DATA shifts<>+0x80(SB)/4, $0x0b0a0908
  1294  DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c
  1295  DATA shifts<>+0x88(SB)/4, $0xffffffff
  1296  DATA shifts<>+0x8c(SB)/4, $0xffffffff
  1297  	
  1298  DATA shifts<>+0x90(SB)/4, $0x0a090807
  1299  DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b
  1300  DATA shifts<>+0x98(SB)/4, $0xffffff0f
  1301  DATA shifts<>+0x9c(SB)/4, $0xffffffff
  1302  	
  1303  DATA shifts<>+0xa0(SB)/4, $0x09080706
  1304  DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a
  1305  DATA shifts<>+0xa8(SB)/4, $0xffff0f0e
  1306  DATA shifts<>+0xac(SB)/4, $0xffffffff
  1307  	
  1308  DATA shifts<>+0xb0(SB)/4, $0x08070605
  1309  DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09
  1310  DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d
  1311  DATA shifts<>+0xbc(SB)/4, $0xffffffff
  1312  	
  1313  DATA shifts<>+0xc0(SB)/4, $0x07060504
  1314  DATA shifts<>+0xc4(SB)/4, $0x0b0a0908
  1315  DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c
  1316  DATA shifts<>+0xcc(SB)/4, $0xffffffff
  1317  	
  1318  DATA shifts<>+0xd0(SB)/4, $0x06050403
  1319  DATA shifts<>+0xd4(SB)/4, $0x0a090807
  1320  DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b
  1321  DATA shifts<>+0xdc(SB)/4, $0xffffff0f
  1322  	
  1323  DATA shifts<>+0xe0(SB)/4, $0x05040302
  1324  DATA shifts<>+0xe4(SB)/4, $0x09080706
  1325  DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a
  1326  DATA shifts<>+0xec(SB)/4, $0xffff0f0e
  1327  	
  1328  DATA shifts<>+0xf0(SB)/4, $0x04030201
  1329  DATA shifts<>+0xf4(SB)/4, $0x08070605
  1330  DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09
  1331  DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d
  1332  
  1333  GLOBL shifts<>(SB),RODATA,$256
  1334  
  1335  TEXT runtime·memeq(SB),NOSPLIT,$0-13
        	// Load memeqbody's register arguments and tail-jump to it;
        	// memeqbody writes the boolean result through AX.
  1336  	LEAL	ret+12(FP), AX	// AX = address of result byte
  1337  	MOVL	size+8(FP), BX	// BX = count
  1338  	MOVL	b+4(FP), DI	// DI = b
  1339  	MOVL	a+0(FP), SI	// SI = a
  1340  	JMP	runtime·memeqbody(SB)
  1341  
  1342  // memequal_varlen(a, b unsafe.Pointer) bool
        // The number of bytes to compare comes from the closure in DX, not from an argument.
  1343  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
  1344  	MOVL	b+4(FP), DI
  1345  	MOVL	a+0(FP), SI
  1346  	CMPL	SI, DI
  1347  	JEQ	identical	// same pointer: equal without reading memory
  1348  	LEAL	ret+8(FP), AX	// result byte for memeqbody
  1349  	MOVL	4(DX), BX	// compiler stores size at offset 4 in the closure
  1350  	JMP	runtime·memeqbody(SB)
  1351  identical:
  1352  	MOVB	$1, ret+8(FP)
  1353  	RET
  1354  
  1355  // eqstring reports whether two strings are equal.
  1356  // The compiler guarantees that strings passed
  1357  // to eqstring have equal length, so only s1's length is read.
  1358  // See runtime_test.go:eqstring_generic for
  1359  // equivalent Go code.
  1360  TEXT runtime·eqstring(SB),NOSPLIT,$0-17
  1361  	MOVL	s2str+8(FP), DI
  1362  	MOVL	s1str+0(FP), SI
  1363  	CMPL	SI, DI
  1364  	JEQ	samedata	// both strings share one data pointer
  1365  	LEAL	v+16(FP), AX	// result byte for memeqbody
  1366  	MOVL	s1len+4(FP), BX
  1367  	JMP	runtime·memeqbody(SB)
  1368  samedata:
  1369  	MOVB	$1, v+16(FP)
  1370  	RET
  1371  
  1372  TEXT bytes·Equal(SB),NOSPLIT,$0-25
        	// Slices of different length are unequal; otherwise memeqbody
        	// compares the bytes and writes the result through AX.
  1373  	MOVL	b_len+16(FP), CX
  1374  	MOVL	a_len+4(FP), BX
  1375  	CMPL	BX, CX
  1376  	JNE	lendiff
  1377  	LEAL	ret+24(FP), AX	// result byte for memeqbody
  1378  	MOVL	b+12(FP), DI
  1379  	MOVL	a+0(FP), SI
  1380  	JMP	runtime·memeqbody(SB)
  1381  lendiff:
  1382  	MOVB	$0, ret+24(FP)
  1383  	RET
  1384  
  1385  // memeqbody is the shared comparison routine that the equality entry points JMP to.
  1386  // a in SI
  1387  // b in DI
  1388  // count in BX
        // address of result byte in AX
        // It stores 1 through AX when the count bytes at a and b are equal, else 0.
  1389  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1390  	CMPL	BX, $4
  1391  	JB	small
  1392  
  1393  	// 64 bytes at a time using xmm registers
  1394  hugeloop:
  1395  	CMPL	BX, $64
  1396  	JB	bigloop
  1397  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
  1398  	JE	bigloop
  1399  	MOVOU	(SI), X0
  1400  	MOVOU	(DI), X1
  1401  	MOVOU	16(SI), X2
  1402  	MOVOU	16(DI), X3
  1403  	MOVOU	32(SI), X4
  1404  	MOVOU	32(DI), X5
  1405  	MOVOU	48(SI), X6
  1406  	MOVOU	48(DI), X7
  1407  	PCMPEQB	X1, X0
  1408  	PCMPEQB	X3, X2
  1409  	PCMPEQB	X5, X4
  1410  	PCMPEQB	X7, X6
  1411  	PAND	X2, X0
  1412  	PAND	X6, X4
  1413  	PAND	X4, X0	// X0 byte i is 0xff only if byte i matched in all four 16-byte pairs
  1414  	PMOVMSKB X0, DX
  1415  	ADDL	$64, SI
  1416  	ADDL	$64, DI
  1417  	SUBL	$64, BX
  1418  	CMPL	DX, $0xffff	// all 16 mask bits set <=> all 64 bytes equal
  1419  	JEQ	hugeloop
  1420  	MOVB	$0, (AX)
  1421  	RET
  1422  
  1423  	// 4 bytes at a time using 32-bit register
  1424  bigloop:
  1425  	CMPL	BX, $4
  1426  	JBE	leftover
  1427  	MOVL	(SI), CX
  1428  	MOVL	(DI), DX
  1429  	ADDL	$4, SI
  1430  	ADDL	$4, DI
  1431  	SUBL	$4, BX
  1432  	CMPL	CX, DX
  1433  	JEQ	bigloop
  1434  	MOVB	$0, (AX)
  1435  	RET
  1436  
  1437  	// remaining 0-4 bytes
  1438  leftover:
        	// Compare the 4 bytes that end at the end of each buffer; when BX < 4
        	// this re-reads bytes that an earlier iteration already compared.
  1439  	MOVL	-4(SI)(BX*1), CX
  1440  	MOVL	-4(DI)(BX*1), DX
  1441  	CMPL	CX, DX
  1442  	SETEQ	(AX)
  1443  	RET
  1444  
  1445  small:
        	// count is 0-3 here.
  1446  	CMPL	BX, $0
  1447  	JEQ	equal	// ZF is set by the CMPL, so the SETEQ at equal stores 1
  1448  
  1449  	LEAL	0(BX*8), CX
  1450  	NEGL	CX	// CX = -8*count; as a shift count (low 5 bits) that is 32-8*count
  1451  
  1452  	MOVL	SI, DX
  1453  	CMPB	DX, $0xfc	// low address byte > 0xfc: a 4-byte load could run into the next page
  1454  	JA	si_high
  1455  
  1456  	// load at SI won't cross a page boundary.
  1457  	MOVL	(SI), SI
  1458  	JMP	si_finish
  1459  si_high:
  1460  	// address ends in 111111xx.  Load up to bytes we want, move to correct position.
  1461  	MOVL	-4(SI)(BX*1), SI
  1462  	SHRL	CX, SI	// bring the wanted bytes down to the low end
  1463  si_finish:
  1464  
  1465  	// same for DI.
  1466  	MOVL	DI, DX
  1467  	CMPB	DX, $0xfc
  1468  	JA	di_high
  1469  	MOVL	(DI), DI
  1470  	JMP	di_finish
  1471  di_high:
  1472  	MOVL	-4(DI)(BX*1), DI
  1473  	SHRL	CX, DI
  1474  di_finish:
  1475  
  1476  	SUBL	SI, DI
  1477  	SHLL	CX, DI	// shift out the bytes beyond count; ZF is set iff the low count bytes were equal
  1478  equal:
  1479  	SETEQ	(AX)	// uses ZF from the SHLL above, or from the CMPL when count == 0
  1480  	RET
  1481  
  1482  TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
        	// Load cmpbody's register arguments and tail-jump to it;
        	// cmpbody stores 1/0/-1 through AX.
  1483  	LEAL	ret+16(FP), AX	// AX = address of return word
  1484  	MOVL	s2_len+12(FP), DX	// DX = blen
  1485  	MOVL	s2_base+8(FP), DI	// DI = b
  1486  	MOVL	s1_len+4(FP), BX	// BX = alen
  1487  	MOVL	s1_base+0(FP), SI	// SI = a
  1488  	JMP	runtime·cmpbody(SB)
  1489  
  1490  TEXT bytes·Compare(SB),NOSPLIT,$0-28
        	// Load cmpbody's register arguments from the two slice headers and
        	// tail-jump to it; cmpbody stores 1/0/-1 through AX.
        	// Each FP operand names the header field it reads (_base / _len), the
        	// same convention cmpstring uses, instead of reusing the bare slice
        	// name for both the pointer and the length word. Offsets are unchanged.
        	// NOTE(review): confirm the names against the Go declaration of Compare.
  1491  	MOVL	s1_base+0(FP), SI	// SI = a
  1492  	MOVL	s1_len+4(FP), BX	// BX = alen
  1493  	MOVL	s2_base+12(FP), DI	// DI = b
  1494  	MOVL	s2_len+16(FP), DX	// DX = blen
  1495  	LEAL	ret+24(FP), AX	// AX = address of return word
  1496  	JMP	runtime·cmpbody(SB)
  1497  
  1498  TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
        	// Scan s forward for byte c; store the index of the first match in ret, or -1.
  1499  	MOVB	c+12(FP), AL
  1500  	MOVL	s_len+4(FP), CX
  1501  	MOVL	s+0(FP), SI
  1502  	MOVL	SI, DI	// DI walks the slice; SI keeps the start for the index computation
  1503  	CLD
        	REPN; SCASB	// stops after a matching byte (ZF set) or when CX reaches 0
  1504  	JZ	found
  1505  	MOVL	$-1, ret+16(FP)
  1506  	RET
        found:
  1507  	SUBL	SI, DI	// DI is one past the match, so DI-SI-1 is its index
  1508  	SUBL	$1, DI
  1509  	MOVL	DI, ret+16(FP)
  1510  	RET
  1511  
  1512  TEXT strings·IndexByte(SB),NOSPLIT,$0-16
        	// Scan s forward for byte c; store the index of the first match in ret, or -1.
  1513  	MOVB	c+8(FP), AL
  1514  	MOVL	s_len+4(FP), CX
  1515  	MOVL	s+0(FP), SI
  1516  	MOVL	SI, DI	// DI walks the string; SI keeps the start for the index computation
  1517  	CLD
        	REPN; SCASB	// stops after a matching byte (ZF set) or when CX reaches 0
  1518  	JZ	found
  1519  	MOVL	$-1, ret+12(FP)
  1520  	RET
        found:
  1521  	SUBL	SI, DI	// DI is one past the match, so DI-SI-1 is its index
  1522  	SUBL	$1, DI
  1523  	MOVL	DI, ret+12(FP)
  1524  	RET
  1525  
  1526  // cmpbody is the shared three-way comparison that cmpstring and bytes·Compare JMP to.
        // input:
  1527  //   SI = a
  1528  //   DI = b
  1529  //   BX = alen
  1530  //   DX = blen
  1531  //   AX = address of return word (set to 1/0/-1)
        // The first min(alen, blen) bytes are compared; if they are all equal the
        // result is decided by the lengths (see allsame).
  1532  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1533  	MOVL	DX, BP
  1534  	SUBL	BX, DX // DX = blen-alen
  1535  	CMOVLGT	BX, BP // BP = min(alen, blen)
  1536  	CMPL	SI, DI
  1537  	JEQ	allsame	// same pointer: only the lengths can differ
  1538  	CMPL	BP, $4
  1539  	JB	small
  1540  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
  1541  	JE	mediumloop
  1542  largeloop:
        	// 16 bytes at a time using xmm registers
  1543  	CMPL	BP, $16
  1544  	JB	mediumloop
  1545  	MOVOU	(SI), X0
  1546  	MOVOU	(DI), X1
  1547  	PCMPEQB X0, X1
  1548  	PMOVMSKB X1, BX	// alen in BX is no longer needed; DX still holds blen-alen
  1549  	XORL	$0xffff, BX	// convert EQ to NE
  1550  	JNE	diff16	// branch if at least one byte is not equal
  1551  	ADDL	$16, SI
  1552  	ADDL	$16, DI
  1553  	SUBL	$16, BP
  1554  	JMP	largeloop
  1555  
  1556  diff16:
  1557  	BSFL	BX, BX	// index of first byte that differs
  1558  	XORL	DX, DX
  1559  	MOVB	(SI)(BX*1), CX
  1560  	CMPB	CX, (DI)(BX*1)
  1561  	SETHI	DX	// 1 if a's byte is larger (unsigned), else 0
  1562  	LEAL	-1(DX*2), DX	// convert 1/0 to +1/-1
  1563  	MOVL	DX, (AX)
  1564  	RET
  1565  
  1566  mediumloop:
        	// 4 bytes at a time using 32-bit registers
  1567  	CMPL	BP, $4
  1568  	JBE	_0through4
  1569  	MOVL	(SI), BX
  1570  	MOVL	(DI), CX
  1571  	CMPL	BX, CX
  1572  	JNE	diff4
  1573  	ADDL	$4, SI
  1574  	ADDL	$4, DI
  1575  	SUBL	$4, BP
  1576  	JMP	mediumloop
  1577  
  1578  _0through4:
        	// Compare the 4 bytes that end at the end of the common prefix; when
        	// BP < 4 this re-reads bytes that were already found equal.
  1579  	MOVL	-4(SI)(BP*1), BX
  1580  	MOVL	-4(DI)(BP*1), CX
  1581  	CMPL	BX, CX
  1582  	JEQ	allsame
  1583  
  1584  diff4:
        	// BX and CX hold 4 differing bytes of a and b.
  1585  	BSWAPL	BX	// reverse order of bytes
  1586  	BSWAPL	CX
  1587  	XORL	BX, CX	// find bit differences
  1588  	BSRL	CX, CX	// index of highest bit difference
  1589  	SHRL	CX, BX	// move a's bit to bottom
  1590  	ANDL	$1, BX	// mask bit
  1591  	LEAL	-1(BX*2), BX // 1/0 => +1/-1
  1592  	MOVL	BX, (AX)
  1593  	RET
  1594  
  1595  	// 0-3 bytes in common
  1596  small:
  1597  	LEAL	(BP*8), CX
  1598  	NEGL	CX	// CX = -8*BP; as a shift count (low 5 bits) that is 32-8*BP
  1599  	JEQ	allsame	// NEGL left ZF set, so no bytes in common
  1600  
  1601  	// load si
  1602  	CMPB	SI, $0xfc	// low address byte > 0xfc: take the load that ends at the last byte instead
  1603  	JA	si_high
  1604  	MOVL	(SI), SI
  1605  	JMP	si_finish
  1606  si_high:
  1607  	MOVL	-4(SI)(BP*1), SI
  1608  	SHRL	CX, SI	// bring the wanted bytes down to the low end
  1609  si_finish:
  1610  	SHLL	CX, SI	// move the wanted bytes to the top; the vacated low bits are zero
  1611  
  1612  	// same for di
  1613  	CMPB	DI, $0xfc
  1614  	JA	di_high
  1615  	MOVL	(DI), DI
  1616  	JMP	di_finish
  1617  di_high:
  1618  	MOVL	-4(DI)(BP*1), DI
  1619  	SHRL	CX, DI
  1620  di_finish:
  1621  	SHLL	CX, DI
  1622  
  1623  	BSWAPL	SI	// reverse order of bytes
  1624  	BSWAPL	DI
  1625  	XORL	SI, DI	// find bit differences
  1626  	JEQ	allsame
  1627  	BSRL	DI, CX	// index of highest bit difference
  1628  	SHRL	CX, SI	// move a's bit to bottom
  1629  	ANDL	$1, SI	// mask bit
  1630  	LEAL	-1(SI*2), BX // 1/0 => +1/-1
  1631  	MOVL	BX, (AX)
  1632  	RET
  1633  
  1634  	// all the bytes in common are the same, so we just need
  1635  	// to compare the lengths.
  1636  allsame:
  1637  	XORL	BX, BX
  1638  	XORL	CX, CX
  1639  	TESTL	DX, DX	// DX = blen-alen, computed at entry
  1640  	SETLT	BX	// 1 if alen > blen
  1641  	SETEQ	CX	// 1 if alen == blen
  1642  	LEAL	-1(CX)(BX*2), BX	// 1,0,-1 result
  1643  	MOVL	BX, (AX)
  1644  	RET
  1645  
  1646  TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
        	// Advance the generator state kept in the current m's fastrand field
        	// and return the new value:
        	//   x += x; if the doubled value has its sign bit set, x ^= 0x88888eef.
  1647  	get_tls(CX)
  1648  	MOVL	g(CX), AX
  1649  	MOVL	g_m(AX), AX	// AX = current g's m
  1650  	MOVL	m_fastrand(AX), DX
  1651  	ADDL	DX, DX	// DX = 2*x
  1652  	MOVL	DX, BX	// keep plain 2*x in case the xor must be undone
  1653  	XORL	$0x88888eef, DX	// the constant has bit 31 set, so this flips the sign bit
  1654  	CMOVLMI	BX, DX	// xor result negative => 2*x was non-negative => keep plain 2*x
  1655  	MOVL	DX, m_fastrand(AX)
  1656  	MOVL	DX, ret+0(FP)
  1657  	RET
  1658  
  1659  TEXT runtime·return0(SB), NOSPLIT, $0
        	// Leaves 0 in AX and returns; nothing is written to the argument frame.
        	// NOTE(review): presumably a caller reads AX as a result register — confirm at the call sites.
  1660  	MOVL	$0, AX
  1661  	RET
  1662  
  1663  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  1664  // Must obey the gcc calling convention.
        // The result is left in AX; only AX and CX are written.
  1665  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  1666  	get_tls(CX)
  1667  	MOVL	g(CX), AX	// AX = g
  1668  	MOVL	g_m(AX), AX	// AX = g->m
  1669  	MOVL	m_curg(AX), AX	// AX = g->m->curg
  1670  	MOVL	(g_stack+stack_hi)(AX), AX	// AX = g->m->curg.stack.hi
  1671  	RET
  1672  
  1673  // The top-most function running on a goroutine
  1674  // returns to goexit+PCQuantum.
        // The body is a NOP byte, a call to goexit1, and a trailing NOP byte.
        // There is deliberately no RET: goexit1 does not return.
  1675  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  1676  	BYTE	$0x90	// NOP
  1677  	CALL	runtime·goexit1(SB)	// does not return
  1678  	// traceback from goexit1 must hit code range of goexit
        	// (the trailing NOP keeps the CALL's return address inside this function)
  1679  	BYTE	$0x90	// NOP
  1680  
  1681  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4
        	// Issues a PREFETCHT0 hint for the address passed as the only argument; returns nothing.
  1682  	MOVL	addr+0(FP), AX
  1683  	PREFETCHT0	(AX)
  1684  	RET
  1685  
  1686  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4
        	// Issues a PREFETCHT1 hint for the address passed as the only argument; returns nothing.
  1687  	MOVL	addr+0(FP), AX
  1688  	PREFETCHT1	(AX)
  1689  	RET
  1690  
  1691  
  1692  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4
        	// Issues a PREFETCHT2 hint for the address passed as the only argument; returns nothing.
  1693  	MOVL	addr+0(FP), AX
  1694  	PREFETCHT2	(AX)
  1695  	RET
  1696  
  1697  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4
        	// Issues a PREFETCHNTA hint for the address passed as the only argument; returns nothing.
  1698  	MOVL	addr+0(FP), AX
  1699  	PREFETCHNTA	(AX)
  1700  	RET