github.com/mdempsky/go@v0.0.0-20151201204031-5dd372bd1e70/src/runtime/asm_386.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
        	// Bootstrap entry point. Saves argc/argv, gives g0 provisional stack
        	// bounds, probes CPUID, sets up TLS (through _cgo_init or ldt0setup),
        	// links g0 and m0, then calls check, args, osinit and schedinit,
        	// queues runtime·main with newproc and enters mstart.
    11  	// copy arguments forward on an even stack
    12  	MOVL	argc+0(FP), AX
    13  	MOVL	argv+4(FP), BX
    14  	SUBL	$128, SP		// plenty of scratch
    15  	ANDL	$~15, SP		// round SP down to a multiple of 16
    16  	MOVL	AX, 120(SP)		// save argc, argv away
    17  	MOVL	BX, 124(SP)
    18  
    19  	// set default stack bounds.
    20  	// _cgo_init may update stackguard.
    21  	MOVL	$runtime·g0(SB), BP
    22  	LEAL	(-64*1024+104)(SP), BX	// BX = SP - 64KB + 104, used as guard and low bound
    23  	MOVL	BX, g_stackguard0(BP)
    24  	MOVL	BX, g_stackguard1(BP)
    25  	MOVL	BX, (g_stack+stack_lo)(BP)
    26  	MOVL	SP, (g_stack+stack_hi)(BP)
    27  	
    28  	// find out information about the processor we're on
    29  	MOVL	$0, AX
    30  	CPUID
    31  	CMPL	AX, $0
    32  	JE	nocpuinfo	// leaf 0 returned AX == 0: skip the vendor and feature probes
    33  
    34  	// Figure out how to serialize RDTSC.
    35  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    36  	// Don't know about the rest, so let's do MFENCE.
    37  	CMPL	BX, $0x756E6547  // "Genu"
    38  	JNE	notintel
    39  	CMPL	DX, $0x49656E69  // "ineI"
    40  	JNE	notintel
    41  	CMPL	CX, $0x6C65746E  // "ntel"
    42  	JNE	notintel
    43  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
    44  notintel:
    45  
    46  	MOVL	$1, AX	// CPUID leaf 1: CX and DX are saved below as cpuid_ecx / cpuid_edx
    47  	CPUID
    48  	MOVL	CX, AX // Move to global variable clobbers CX when generating PIC
    49  	MOVL	AX, runtime·cpuid_ecx(SB)
    50  	MOVL	DX, runtime·cpuid_edx(SB)
    51  nocpuinfo:	
    52  
    53  	// if there is a _cgo_init, call it to let it
    54  	// initialize and to set up GS.  if not,
    55  	// we set up GS ourselves.
    56  	MOVL	_cgo_init(SB), AX
    57  	TESTL	AX, AX
    58  	JZ	needtls
    59  	MOVL	$setg_gcc<>(SB), BX
    60  	MOVL	BX, 4(SP)	// second argument: address of setg_gcc
    61  	MOVL	BP, 0(SP)	// first argument: address of g0 (loaded into BP above)
    62  	CALL	AX
    63  
    64  	// update stackguard after _cgo_init
    65  	MOVL	$runtime·g0(SB), CX
    66  	MOVL	(g_stack+stack_lo)(CX), AX
    67  	ADDL	$const__StackGuard, AX
    68  	MOVL	AX, g_stackguard0(CX)
    69  	MOVL	AX, g_stackguard1(CX)
    70  
    71  #ifndef GOOS_windows
    72  	// skip runtime·ldt0setup(SB) and tls test after _cgo_init for non-windows
    73  	JMP ok
    74  #endif
    75  needtls:
    76  #ifdef GOOS_plan9
    77  	// skip runtime·ldt0setup(SB) and tls test on Plan 9 in all cases
    78  	JMP	ok
    79  #endif
    80  
    81  	// set up %gs
    82  	CALL	runtime·ldt0setup(SB)
    83  
    84  	// store through it, to make sure it works
    85  	get_tls(BX)
    86  	MOVL	$0x123, g(BX)
    87  	MOVL	runtime·m0+m_tls(SB), AX	// read the same slot back directly from m0.tls
    88  	CMPL	AX, $0x123
    89  	JEQ	ok
    90  	MOVL	AX, 0	// abort
    91  ok:
    92  	// set up m and g "registers"
    93  	get_tls(BX)
    94  	LEAL	runtime·g0(SB), DX
    95  	MOVL	DX, g(BX)
    96  	LEAL	runtime·m0(SB), AX
    97  
    98  	// save m->g0 = g0
    99  	MOVL	DX, m_g0(AX)
   100  	// save g0->m = m0
   101  	MOVL	AX, g_m(DX)
   102  
   103  	CALL	runtime·emptyfunc(SB)	// fault if stack check is wrong
   104  
   105  	// convention is D is always cleared
   106  	CLD
   107  
   108  	CALL	runtime·check(SB)
   109  
   110  	// saved argc, argv
   111  	MOVL	120(SP), AX
   112  	MOVL	AX, 0(SP)
   113  	MOVL	124(SP), AX
   114  	MOVL	AX, 4(SP)
   115  	CALL	runtime·args(SB)
   116  	CALL	runtime·osinit(SB)
   117  	CALL	runtime·schedinit(SB)
   118  
   119  	// create a new goroutine to start program
   120  	PUSHL	$runtime·mainPC(SB)	// entry
   121  	PUSHL	$0	// arg size
   122  	CALL	runtime·newproc(SB)
   123  	POPL	AX	// discard the two words pushed for newproc
   124  	POPL	AX
   125  
   126  	// start this M
   127  	CALL	runtime·mstart(SB)
   128  
   129  	INT $3	// trap if mstart ever returns
   130  	RET
   131  
   132  DATA	runtime·mainPC+0(SB)/4,$runtime·main(SB)	// 4-byte cell holding the address of runtime·main
   133  GLOBL	runtime·mainPC(SB),RODATA,$4	// read-only; rt0_go pushes this symbol's address as newproc's entry argument
   134  
   135  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
        	// Raise a breakpoint trap (INT 3), then return to the caller.
   136  	INT $3
   137  	RET
   138  
   139  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   140  	// Linux and MinGW start the FPU in extended double precision.
   141  	// Other operating systems use double precision.
   142  	// Change to double precision to match them,
   143  	// and to match other hardware that only has double.
   144  	PUSHL $0x27F	// new control word, placed on the stack so FLDCW can read it
   145  	FLDCW	0(SP)	// load it into the x87 control register
   146  	POPL AX	// drop the temporary (AX is clobbered)
   147  	RET
   148  
   149  /*
   150   *  go-routine
   151   */
   152  
   153  // void gosave(Gobuf*)
   154  // Save the caller's SP, PC and the current g in the Gobuf and zero its ret and ctxt fields; setjmp
   155  TEXT runtime·gosave(SB), NOSPLIT, $0-4
   156  	MOVL	buf+0(FP), AX		// gobuf
   157  	LEAL	buf+0(FP), BX		// caller's SP
   158  	MOVL	BX, gobuf_sp(AX)
   159  	MOVL	0(SP), BX		// caller's PC
   160  	MOVL	BX, gobuf_pc(AX)
   161  	MOVL	$0, gobuf_ret(AX)
   162  	MOVL	$0, gobuf_ctxt(AX)
   163  	get_tls(CX)
   164  	MOVL	g(CX), BX		// current g
   165  	MOVL	BX, gobuf_g(AX)
   166  	RET
   167  
   168  // void gogo(Gobuf*)
   169  // Restore g, SP, ret (in AX) and ctxt (in DX) from the Gobuf, clear those saved fields, and jump to gobuf.pc; longjmp
   170  TEXT runtime·gogo(SB), NOSPLIT, $0-4
   171  	MOVL	buf+0(FP), BX		// gobuf
   172  	MOVL	gobuf_g(BX), DX
   173  	MOVL	0(DX), CX		// make sure g != nil
   174  	get_tls(CX)
   175  	MOVL	DX, g(CX)		// gobuf.g becomes the current g
   176  	MOVL	gobuf_sp(BX), SP	// restore SP
   177  	MOVL	gobuf_ret(BX), AX
   178  	MOVL	gobuf_ctxt(BX), DX
   179  	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
   180  	MOVL	$0, gobuf_ret(BX)
   181  	MOVL	$0, gobuf_ctxt(BX)
   182  	MOVL	gobuf_pc(BX), BX	// BX = saved PC; the gobuf pointer is no longer needed
   183  	JMP	BX
   184  
   185  // func mcall(fn func(*g))
   186  // Switch to m->g0's stack, call fn(g).
   187  // Fn must never return.  It should gogo(&g->sched)
   188  // to keep running g.
   189  TEXT runtime·mcall(SB), NOSPLIT, $0-4
   190  	MOVL	fn+0(FP), DI
   191  
   192  	get_tls(DX)
   193  	MOVL	g(DX), AX	// save state in g->sched
   194  	MOVL	0(SP), BX	// caller's PC
   195  	MOVL	BX, (g_sched+gobuf_pc)(AX)
   196  	LEAL	fn+0(FP), BX	// caller's SP
   197  	MOVL	BX, (g_sched+gobuf_sp)(AX)
   198  	MOVL	AX, (g_sched+gobuf_g)(AX)
   199  
   200  	// switch to m->g0 & its stack, call fn
   201  	MOVL	g(DX), BX
   202  	MOVL	g_m(BX), BX
   203  	MOVL	m_g0(BX), SI
   204  	CMPL	SI, AX	// if g == m->g0 call badmcall
   205  	JNE	3(PC)	// not on g0: skip the two-instruction badmcall jump
   206  	MOVL	$runtime·badmcall(SB), AX
   207  	JMP	AX
   208  	MOVL	SI, g(DX)	// g = m->g0
   209  	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   210  	PUSHL	AX	// argument to fn: the g we just switched away from
   211  	MOVL	DI, DX	// DX = fn
   212  	MOVL	0(DI), DI	// DI = code pointer stored in the first word of fn
   213  	CALL	DI
   214  	POPL	AX
   215  	MOVL	$runtime·badmcall2(SB), AX	// reached only if fn returned, which it must not
   216  	JMP	AX
   217  	RET
   218  
   219  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   220  // of the G stack.  We need to distinguish the routine that
   221  // lives at the bottom of the G stack from the one that lives
   222  // at the top of the system stack because the one at the top of
   223  // the system stack terminates the stack walk (see topofstack()).
   224  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
        	// The body is a bare RET; systemstack stores this function's address
        	// in g->sched.pc before switching stacks.
   225  	RET
   226  
   227  // func systemstack(fn func())
        // Call fn on the g0 stack. If the current g is already gsignal or g0,
        // fn is called directly. If it is m->curg, the state is saved in
        // g->sched, the stack is switched to g0, fn is called, and execution
        // switches back to m->curg. Any other g is reported via badsystemstack.
   228  TEXT runtime·systemstack(SB), NOSPLIT, $0-4
   229  	MOVL	fn+0(FP), DI	// DI = fn
   230  	get_tls(CX)
   231  	MOVL	g(CX), AX	// AX = g
   232  	MOVL	g_m(AX), BX	// BX = m
   233  
   234  	MOVL	m_gsignal(BX), DX	// DX = gsignal
   235  	CMPL	AX, DX
   236  	JEQ	noswitch
   237  
   238  	MOVL	m_g0(BX), DX	// DX = g0
   239  	CMPL	AX, DX
   240  	JEQ	noswitch
   241  
   242  	MOVL	m_curg(BX), BP
   243  	CMPL	AX, BP
   244  	JEQ	switch
   245  	
   246  	// Bad: g is not gsignal, not g0, not curg. What is it?
   247  	// Hide call from linker nosplit analysis.
   248  	MOVL	$runtime·badsystemstack(SB), AX
   249  	CALL	AX
   250  
   251  switch:
   252  	// save our state in g->sched.  Pretend to
   253  	// be systemstack_switch if the G stack is scanned.
   254  	MOVL	$runtime·systemstack_switch(SB), (g_sched+gobuf_pc)(AX)
   255  	MOVL	SP, (g_sched+gobuf_sp)(AX)
   256  	MOVL	AX, (g_sched+gobuf_g)(AX)
   257  
   258  	// switch to g0
   259  	get_tls(CX)
   260  	MOVL	DX, g(CX)	// DX still holds g0 from the comparison above
   261  	MOVL	(g_sched+gobuf_sp)(DX), BX
   262  	// make it look like mstart called systemstack on g0, to stop traceback
   263  	SUBL	$4, BX
   264  	MOVL	$runtime·mstart(SB), DX
   265  	MOVL	DX, 0(BX)
   266  	MOVL	BX, SP
   267  
   268  	// call target function
   269  	MOVL	DI, DX	// DX = fn
   270  	MOVL	0(DI), DI	// DI = code pointer stored in the first word of fn
   271  	CALL	DI
   272  
   273  	// switch back to g
   274  	get_tls(CX)
   275  	MOVL	g(CX), AX
   276  	MOVL	g_m(AX), BX
   277  	MOVL	m_curg(BX), AX
   278  	MOVL	AX, g(CX)
   279  	MOVL	(g_sched+gobuf_sp)(AX), SP	// back on curg's saved stack pointer
   280  	MOVL	$0, (g_sched+gobuf_sp)(AX)	// clear the saved SP now that it has been consumed
   281  	RET
   282  
   283  noswitch:
   284  	// already on system stack, just call directly
   285  	MOVL	DI, DX
   286  	MOVL	0(DI), DI
   287  	CALL	DI
   288  	RET
   289  
   290  /*
   291   * support for morestack
   292   */
   293  
   294  // Called during function prolog when more stack is needed.
   295  //
   296  // The traceback routines see morestack on a g0 as being
   297  // the top of a stack (for example, morestack calling newstack
   298  // calling the scheduler calling newm calling gc), so we must
   299  // record an argument size. For that purpose, it has no arguments.
        //
        // Records f's caller in m->morebuf and f's own context (including DX as
        // ctxt) in g->sched, then switches to m->g0 and calls newstack.
   300  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   301  	// Cannot grow scheduler stack (m->g0).
   302  	get_tls(CX)
   303  	MOVL	g(CX), BX
   304  	MOVL	g_m(BX), BX
   305  	MOVL	m_g0(BX), SI
   306  	CMPL	g(CX), SI
   307  	JNE	2(PC)	// current g is not g0: skip the trap
   308  	INT	$3
   309  
   310  	// Cannot grow signal stack.
   311  	MOVL	m_gsignal(BX), SI
   312  	CMPL	g(CX), SI
   313  	JNE	2(PC)	// current g is not gsignal: skip the trap
   314  	INT	$3
   315  
   316  	// Called from f.
   317  	// Set m->morebuf to f's caller.
   318  	MOVL	4(SP), DI	// f's caller's PC
   319  	MOVL	DI, (m_morebuf+gobuf_pc)(BX)
   320  	LEAL	8(SP), CX	// f's caller's SP
   321  	MOVL	CX, (m_morebuf+gobuf_sp)(BX)
   322  	get_tls(CX)
   323  	MOVL	g(CX), SI
   324  	MOVL	SI, (m_morebuf+gobuf_g)(BX)
   325  
   326  	// Set g->sched to context in f.
   327  	MOVL	0(SP), AX	// f's PC
   328  	MOVL	AX, (g_sched+gobuf_pc)(SI)
   329  	MOVL	SI, (g_sched+gobuf_g)(SI)
   330  	LEAL	4(SP), AX	// f's SP
   331  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   332  	MOVL	DX, (g_sched+gobuf_ctxt)(SI)	// DX as it was on entry is saved as the context
   333  
   334  	// Call newstack on m->g0's stack.
   335  	MOVL	m_g0(BX), BP
   336  	MOVL	BP, g(CX)
   337  	MOVL	(g_sched+gobuf_sp)(BP), AX
   338  	MOVL	-4(AX), BX	// fault if CALL would, before smashing SP
   339  	MOVL	AX, SP
   340  	CALL	runtime·newstack(SB)
   341  	MOVL	$0, 0x1003	// crash if newstack returns
   342  	RET
   343  
   344  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0-0
        	// Same as morestack, but first zeroes DX, the register that
        	// morestack stores into g->sched.ctxt.
   345  	MOVL	$0, DX
   346  	JMP runtime·morestack(SB)
   347  
   348  TEXT runtime·stackBarrier(SB),NOSPLIT,$0
        	// Looks up the original return PC for the current barrier, advances
        	// g.stkbarPos, and jumps to that PC.
   349  	// We came here via a RET to an overwritten return PC.
   350  	// AX may be live. Other registers are available.
   351  
   352  	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   353  	get_tls(CX)
   354  	MOVL	g(CX), CX
   355  	MOVL	(g_stkbar+slice_array)(CX), DX
   356  	MOVL	g_stkbarPos(CX), BX
   357  	IMULL	$stkbar__size, BX	// BX = stkbarPos * stkbar__size (byte offset); too big for a SIB scale.
   358  	MOVL	stkbar_savedLRVal(DX)(BX*1), BX
   359  	// Record that this stack barrier was hit.
   360  	ADDL	$1, g_stkbarPos(CX)
   361  	// Jump to the original return PC.
   362  	JMP	BX
   363  
   364  // reflectcall: call a function with the given argument list
   365  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   366  // we don't have variable-sized frames, so we use a small number
   367  // of constant-sized-frame functions to encode a few bits of size in the pc.
   368  // Caution: ugly multiline assembly macros in your future!
   369  
   370  #define DISPATCH(NAME,MAXSIZE)		\
   371  	CMPL	CX, $MAXSIZE;		\
   372  	JA	3(PC);			\
   373  	MOVL	$NAME(SB), AX;		\
   374  	JMP	AX
   375  // Note: can't just "JMP NAME(SB)" - bad inlining results.
        // DISPATCH(NAME, MAXSIZE): if CX <= MAXSIZE (unsigned compare), jump to NAME
        // through AX; otherwise skip those two instructions and fall through to
        // whatever follows the macro. Clobbers AX only when the jump is taken.
   376  
   377  TEXT reflect·call(SB), NOSPLIT, $0-0
        	// Entry point under the reflect package name; tail-jumps to ·reflectcall.
   378  	JMP	·reflectcall(SB)
   379  
   380  TEXT ·reflectcall(SB), NOSPLIT, $0-20
        	// Selects the smallest callNNN trampoline whose frame size is >= argsize
        	// and jumps to it; jumps to badreflectcall if argsize exceeds them all.
   381  	MOVL	argsize+12(FP), CX	// CX = argsize, the value each DISPATCH compares
   382  	DISPATCH(runtime·call16, 16)
   383  	DISPATCH(runtime·call32, 32)
   384  	DISPATCH(runtime·call64, 64)
   385  	DISPATCH(runtime·call128, 128)
   386  	DISPATCH(runtime·call256, 256)
   387  	DISPATCH(runtime·call512, 512)
   388  	DISPATCH(runtime·call1024, 1024)
   389  	DISPATCH(runtime·call2048, 2048)
   390  	DISPATCH(runtime·call4096, 4096)
   391  	DISPATCH(runtime·call8192, 8192)
   392  	DISPATCH(runtime·call16384, 16384)
   393  	DISPATCH(runtime·call32768, 32768)
   394  	DISPATCH(runtime·call65536, 65536)
   395  	DISPATCH(runtime·call131072, 131072)
   396  	DISPATCH(runtime·call262144, 262144)
   397  	DISPATCH(runtime·call524288, 524288)
   398  	DISPATCH(runtime·call1048576, 1048576)
   399  	DISPATCH(runtime·call2097152, 2097152)
   400  	DISPATCH(runtime·call4194304, 4194304)
   401  	DISPATCH(runtime·call8388608, 8388608)
   402  	DISPATCH(runtime·call16777216, 16777216)
   403  	DISPATCH(runtime·call33554432, 33554432)
   404  	DISPATCH(runtime·call67108864, 67108864)
   405  	DISPATCH(runtime·call134217728, 134217728)
   406  	DISPATCH(runtime·call268435456, 268435456)
   407  	DISPATCH(runtime·call536870912, 536870912)
   408  	DISPATCH(runtime·call1073741824, 1073741824)
   409  	MOVL	$runtime·badreflectcall(SB), AX	// argsize is larger than every frame size above
   410  	JMP	AX
   411  
   412  #define CALLFN(NAME,MAXSIZE)			\
   413  TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
   414  	NO_LOCAL_POINTERS;			\
   415  	/* copy argsize bytes from argptr into the frame at SP */		\
   416  	MOVL	argptr+8(FP), SI;		\
   417  	MOVL	argsize+12(FP), CX;		\
   418  	MOVL	SP, DI;				\
   419  	REP;MOVSB;				\
   420  	/* call the code pointer stored in f, with f itself in DX */			\
   421  	MOVL	f+4(FP), DX;			\
   422  	MOVL	(DX), AX; 			\
   423  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   424  	CALL	AX;				\
   425  	/* copy frame bytes [retoffset, argsize) back to argptr */		\
   426  	MOVL	argptr+8(FP), DI;		\
   427  	MOVL	argsize+12(FP), CX;		\
   428  	MOVL	retoffset+16(FP), BX;		\
   429  	MOVL	SP, SI;				\
   430  	ADDL	BX, DI;				\
   431  	ADDL	BX, SI;				\
   432  	SUBL	BX, CX;				\
   433  	REP;MOVSB;				\
   434  	/* call callwritebarrier(argtype, argptr, argsize, retoffset) */	\
   435  	MOVL	argtype+0(FP), DX;		\
   436  	MOVL	argptr+8(FP), DI;		\
   437  	MOVL	argsize+12(FP), CX;		\
   438  	MOVL	retoffset+16(FP), BX;		\
   439  	MOVL	DX, 0(SP);			\
   440  	MOVL	DI, 4(SP);			\
   441  	MOVL	CX, 8(SP);			\
   442  	MOVL	BX, 12(SP);			\
   443  	CALL	runtime·callwritebarrier(SB);	\
   444  	RET
   445  
   446  CALLFN(·call16, 16)	// one fixed-frame trampoline per power of two, 16 bytes up to 1073741824 bytes
   447  CALLFN(·call32, 32)
   448  CALLFN(·call64, 64)
   449  CALLFN(·call128, 128)
   450  CALLFN(·call256, 256)
   451  CALLFN(·call512, 512)
   452  CALLFN(·call1024, 1024)
   453  CALLFN(·call2048, 2048)
   454  CALLFN(·call4096, 4096)
   455  CALLFN(·call8192, 8192)
   456  CALLFN(·call16384, 16384)
   457  CALLFN(·call32768, 32768)
   458  CALLFN(·call65536, 65536)
   459  CALLFN(·call131072, 131072)
   460  CALLFN(·call262144, 262144)
   461  CALLFN(·call524288, 524288)
   462  CALLFN(·call1048576, 1048576)
   463  CALLFN(·call2097152, 2097152)
   464  CALLFN(·call4194304, 4194304)
   465  CALLFN(·call8388608, 8388608)
   466  CALLFN(·call16777216, 16777216)
   467  CALLFN(·call33554432, 33554432)
   468  CALLFN(·call67108864, 67108864)
   469  CALLFN(·call134217728, 134217728)
   470  CALLFN(·call268435456, 268435456)
   471  CALLFN(·call536870912, 536870912)
   472  CALLFN(·call1073741824, 1073741824)	// largest size reflectcall dispatches to
   473  
   474  TEXT runtime·procyield(SB),NOSPLIT,$0-0
        	// Execute PAUSE once per requested cycle, counting AX down to zero.
   475  	MOVL	cycles+0(FP), AX	// AX = iterations still to run
   476  spin:
   477  	PAUSE
   478  	DECL	AX
   479  	JNE	spin			// loop until the counter reaches zero
   480  	RET
   481  
   482  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   483  	// Stores are already ordered on x86, so this is just a
   484  	// compile barrier: the body is a bare RET with no fence instruction.
   485  	RET
   486  
   487  // void jmpdefer(fn, sp);
   488  // called from deferreturn.
   489  // 1. pop the caller
   490  // 2. sub 5 bytes from the caller's return address
   491  // 3. jmp to the argument
   492  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
   493  	MOVL	fv+0(FP), DX	// fn
   494  	MOVL	argp+4(FP), BX	// caller sp
   495  	LEAL	-4(BX), SP	// caller sp after CALL
   496  	SUBL	$5, (SP)	// return to CALL again
   497  	MOVL	0(DX), BX	// BX = code pointer stored in the first word of fn; DX keeps fn
   498  	JMP	BX	// but first run the deferred function
   499  
   500  // Save state of caller into g->sched.
        // Writes sched.sp and sched.pc and zeroes sched.ret and sched.ctxt.
        // AX and BX are pushed on entry and popped before returning.
   501  TEXT gosave<>(SB),NOSPLIT,$0
   502  	PUSHL	AX
   503  	PUSHL	BX
   504  	get_tls(BX)
   505  	MOVL	g(BX), BX	// BX = current g
   506  	LEAL	arg+0(FP), AX	// AX = address of the first argument slot, used as the caller's SP
   507  	MOVL	AX, (g_sched+gobuf_sp)(BX)
   508  	MOVL	-4(AX), AX	// word just below that slot, saved as the PC
   509  	MOVL	AX, (g_sched+gobuf_pc)(BX)
   510  	MOVL	$0, (g_sched+gobuf_ret)(BX)
   511  	MOVL	$0, (g_sched+gobuf_ctxt)(BX)
   512  	POPL	BX
   513  	POPL	AX
   514  	RET
   515  
   516  // func asmcgocall(fn, arg unsafe.Pointer) int32
   517  // Call fn(arg) on the scheduler stack,
   518  // aligned appropriately for the gcc ABI.
   519  // See cgocall.go for more details.
   520  TEXT ·asmcgocall(SB),NOSPLIT,$0-12
   521  	MOVL	fn+0(FP), AX
   522  	MOVL	arg+4(FP), BX
   523  
   524  	MOVL	SP, DX	// DX = SP on entry, used below to compute the stack depth
   525  
   526  	// Figure out if we need to switch to m->g0 stack.
   527  	// We get called to create new OS threads too, and those
   528  	// come in on the m->g0 stack already.
   529  	get_tls(CX)
   530  	MOVL	g(CX), BP
   531  	MOVL	g_m(BP), BP
   532  	MOVL	m_g0(BP), SI	// SI = m->g0
   533  	MOVL	g(CX), DI	// DI = current g
   534  	CMPL	SI, DI
   535  	JEQ	noswitch	// already running on g0
   536  	CALL	gosave<>(SB)
   537  	get_tls(CX)
   538  	MOVL	SI, g(CX)
   539  	MOVL	(g_sched+gobuf_sp)(SI), SP
   540  
   541  noswitch:
   542  	// Now on a scheduling stack (a pthread-created stack).
   543  	SUBL	$32, SP
   544  	ANDL	$~15, SP	// alignment, perhaps unnecessary
   545  	MOVL	DI, 8(SP)	// save g
   546  	MOVL	(g_stack+stack_hi)(DI), DI
   547  	SUBL	DX, DI
   548  	MOVL	DI, 4(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   549  	MOVL	BX, 0(SP)	// first argument in x86-32 ABI
   550  	CALL	AX
   551  
   552  	// Restore registers, g, stack pointer.
   553  	get_tls(CX)
   554  	MOVL	8(SP), DI	// DI = saved g
   555  	MOVL	(g_stack+stack_hi)(DI), SI
   556  	SUBL	4(SP), SI	// SI = stack.hi - saved depth = the original SP
   557  	MOVL	DI, g(CX)
   558  	MOVL	SI, SP
   559  
   560  	MOVL	AX, ret+8(FP)	// AX as left by fn becomes the int32 result
   561  	RET
   562  
   563  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
   564  // Turn the fn into a Go func (by taking its address) and call
   565  // cgocallback_gofunc.
   566  TEXT runtime·cgocallback(SB),NOSPLIT,$12-12
   567  	LEAL	fn+0(FP), AX	// address of the fn argument slot, passed as the FuncVal*
   568  	MOVL	AX, 0(SP)
   569  	MOVL	frame+4(FP), AX
   570  	MOVL	AX, 4(SP)
   571  	MOVL	framesize+8(FP), AX
   572  	MOVL	AX, 8(SP)
   573  	MOVL	$runtime·cgocallback_gofunc(SB), AX	// called indirectly through AX
   574  	CALL	AX
   575  	RET
   576  
   577  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
   578  // See cgocall.go for more details.
        // Obtains an m if the thread has none (needm), switches from m->g0 to
        // m->curg, calls cgocallbackg, switches back, and releases a borrowed m
        // with dropm.
   579  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-12
   580  	NO_LOCAL_POINTERS
   581  
   582  	// If g is nil, Go did not create the current thread.
   583  	// Call needm to obtain one for temporary use.
   584  	// In this case, we're running on the thread stack, so there's
   585  	// lots of space, but the linker doesn't know. Hide the call from
   586  	// the linker analysis by using an indirect call through AX.
   587  	get_tls(CX)
   588  #ifdef GOOS_windows
   589  	MOVL	$0, BP
   590  	CMPL	CX, $0
   591  	JEQ	2(PC) // TODO
   592  #endif
   593  	MOVL	g(CX), BP
   594  	CMPL	BP, $0
   595  	JEQ	needm
   596  	MOVL	g_m(BP), BP
   597  	MOVL	BP, DX // saved copy of oldm
   598  	JMP	havem
   599  needm:
   600  	MOVL	$0, 0(SP)
   601  	MOVL	$runtime·needm(SB), AX
   602  	CALL	AX
   603  	MOVL	0(SP), DX	// DX (oldm) is reloaded from the slot written before the call
   604  	get_tls(CX)
   605  	MOVL	g(CX), BP
   606  	MOVL	g_m(BP), BP
   607  
   608  	// Set m->sched.sp = SP, so that if a panic happens
   609  	// during the function we are about to execute, it will
   610  	// have a valid SP to run on the g0 stack.
   611  	// The next few lines (after the havem label)
   612  	// will save this SP onto the stack and then write
   613  	// the same SP back to m->sched.sp. That seems redundant,
   614  	// but if an unrecovered panic happens, unwindm will
   615  	// restore the g->sched.sp from the stack location
   616  	// and then systemstack will try to use it. If we don't set it here,
   617  	// that restored SP will be uninitialized (typically 0) and
   618  	// will not be usable.
   619  	MOVL	m_g0(BP), SI
   620  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   621  
   622  havem:
   623  	// Now there's a valid m, and we're running on its m->g0.
   624  	// Save current m->g0->sched.sp on stack and then set it to SP.
   625  	// Save current sp in m->g0->sched.sp in preparation for
   626  	// switch back to m->curg stack.
   627  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   628  	MOVL	m_g0(BP), SI
   629  	MOVL	(g_sched+gobuf_sp)(SI), AX
   630  	MOVL	AX, 0(SP)
   631  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   632  
   633  	// Switch to m->curg stack and call runtime.cgocallbackg.
   634  	// Because we are taking over the execution of m->curg
   635  	// but *not* resuming what had been running, we need to
   636  	// save that information (m->curg->sched) so we can restore it.
   637  	// We can restore m->curg->sched.sp easily, because calling
   638  	// runtime.cgocallbackg leaves SP unchanged upon return.
   639  	// To save m->curg->sched.pc, we push it onto the stack.
   640  	// This has the added benefit that it looks to the traceback
   641  	// routine like cgocallbackg is going to return to that
   642  	// PC (because the frame we allocate below has the same
   643  	// size as cgocallback_gofunc's frame declared above)
   644  	// so that the traceback will seamlessly trace back into
   645  	// the earlier calls.
   646  	//
   647  	// In the new goroutine, 0(SP) holds the saved oldm (DX) register.
   648  	// 4(SP) and 8(SP) are unused.
   649  	MOVL	m_curg(BP), SI
   650  	MOVL	SI, g(CX)
   651  	MOVL	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
   652  	MOVL	(g_sched+gobuf_pc)(SI), BP
   653  	MOVL	BP, -4(DI)	// saved PC sits where a return address would
   654  	LEAL	-(4+12)(DI), SP	// 4 bytes for that PC plus the 12-byte frame
   655  	MOVL	DX, 0(SP)
   656  	CALL	runtime·cgocallbackg(SB)
   657  	MOVL	0(SP), DX	// reload oldm
   658  
   659  	// Restore g->sched (== m->curg->sched) from saved values.
   660  	get_tls(CX)
   661  	MOVL	g(CX), SI
   662  	MOVL	12(SP), BP	// the PC stored at -4(DI) above
   663  	MOVL	BP, (g_sched+gobuf_pc)(SI)
   664  	LEAL	(12+4)(SP), DI
   665  	MOVL	DI, (g_sched+gobuf_sp)(SI)
   666  
   667  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   668  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   669  	// so we do not have to restore it.)
   670  	MOVL	g(CX), BP
   671  	MOVL	g_m(BP), BP
   672  	MOVL	m_g0(BP), SI
   673  	MOVL	SI, g(CX)
   674  	MOVL	(g_sched+gobuf_sp)(SI), SP
   675  	MOVL	0(SP), AX
   676  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   677  	
   678  	// If the m on entry was nil, we called needm above to borrow an m
   679  	// for the duration of the call. Since the call is over, return it with dropm.
   680  	CMPL	DX, $0
   681  	JNE 3(PC)	// oldm was non-nil: skip the two-instruction dropm call
   682  	MOVL	$runtime·dropm(SB), AX
   683  	CALL	AX
   684  
   685  	// Done!
   686  	RET
   687  
   688  // void setg(G*); set g. for use by needm.
   689  TEXT runtime·setg(SB), NOSPLIT, $0-4
   690  	MOVL	gg+0(FP), BX
   691  #ifdef GOOS_windows
   692  	CMPL	BX, $0
   693  	JNE	settls
   694  	MOVL	$0, 0x14(FS)	// gg == nil: clear the slot at 0x14(FS) and return without touching g
   695  	RET
   696  settls:
   697  	MOVL	g_m(BX), AX
   698  	LEAL	m_tls(AX), AX
   699  	MOVL	AX, 0x14(FS)	// point 0x14(FS) at gg->m->tls
   700  #endif
   701  	get_tls(CX)
   702  	MOVL	BX, g(CX)	// store gg as the current g
   703  	RET
   704  
   705  // void setg_gcc(G*); set g. for use by gcc
        // rt0_go passes this function's address to _cgo_init.
   706  TEXT setg_gcc<>(SB), NOSPLIT, $0
   707  	get_tls(AX)
   708  	MOVL	gg+0(FP), DX
   709  	MOVL	DX, g(AX)	// store gg as the current g; clobbers AX and DX
   710  	RET
   711  
   712  // Trap with INT 3 unless g->stack.lo < SP < g->stack.hi for the current g.
   713  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   714  	get_tls(CX)
   715  	MOVL	g(CX), AX			// AX = current g
   716  	CMPL	(g_stack+stack_hi)(AX), SP
   717  	JHI	belowhi				// stack.hi > SP (unsigned)
   718  	INT	$3
        belowhi:
   719  	CMPL	SP, (g_stack+stack_lo)(AX)
   720  	JHI	abovelo				// SP > stack.lo (unsigned)
   721  	INT	$3
        abovelo:
   722  	RET
   723  
   724  TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8
        	// Returns the word stored just below argp. If that word equals
        	// stackBarrierPC, the value produced by nextBarrierPC is returned instead.
   725  	MOVL	argp+0(FP),AX		// addr of first arg
   726  	MOVL	-4(AX),AX		// get calling pc
   727  	CMPL	AX, runtime·stackBarrierPC(SB)
   728  	JNE	nobar
   729  	// Get original return PC.
   730  	CALL	runtime·nextBarrierPC(SB)
   731  	MOVL	0(SP), AX		// nextBarrierPC's result is read from 0(SP)
   732  nobar:
   733  	MOVL	AX, ret+4(FP)
   734  	RET
   735  
   736  TEXT runtime·setcallerpc(SB),NOSPLIT,$4-8
   737  	MOVL	pc+4(FP), BX		// BX = new PC value
   738  	MOVL	argp+0(FP), AX		// AX = addr of first arg
   739  	MOVL	-4(AX), DX		// DX = PC currently stored just below it
   740  	CMPL	DX, runtime·stackBarrierPC(SB)
   741  	JNE	plain
   742  	// The slot holds the stack barrier PC: update the barrier's saved PC instead.
   743  	MOVL	BX, 0(SP)
   744  	CALL	runtime·setNextBarrierPC(SB)
   745  	RET
   746  plain:
   747  	MOVL	BX, -4(AX)		// overwrite the calling pc in place
   748  	RET
   749  
   750  TEXT runtime·getcallersp(SB), NOSPLIT, $0-8
        	// Returns the argp argument unchanged as the result.
   751  	MOVL	argp+0(FP), AX
   752  	MOVL	AX, ret+4(FP)
   753  	RET
   754  
   755  // func cputicks() int64
   756  TEXT runtime·cputicks(SB),NOSPLIT,$0-8
   757  	TESTL	$0x4000000, runtime·cpuid_edx(SB)	// bit clear: no sse2, no mfence, so no fence at all
   758  	JEQ	readtsc
   759  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   760  	JEQ	uselfence
   761  	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0	// MFENCE
   762  	JMP	readtsc
   763  uselfence:
   764  	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8	// LFENCE
   765  readtsc:
   766  	RDTSC
   767  	MOVL	DX, ret_hi+4(FP)	// high 32 bits of the counter
   768  	MOVL	AX, ret_lo+0(FP)	// low 32 bits of the counter
   769  	RET
   770  
   771  TEXT runtime·ldt0setup(SB),NOSPLIT,$16-0
   772  	// set up ldt 7 to point at m0.tls
   773  	// ldt 1 would be fine on Linux, but on OS X, 7 is as low as we can go.
   774  	// the entry number is just a hint.  setldt will set up GS with what it used.
   775  	MOVL	$7, 0(SP)	// first argument: requested entry number
   776  	LEAL	runtime·m0+m_tls(SB), AX
   777  	MOVL	AX, 4(SP)	// second argument: address of m0.tls
   778  	MOVL	$32, 8(SP)	// sizeof(tls array)
   779  	CALL	runtime·setldt(SB)
   780  	RET
   781  
   782  TEXT runtime·emptyfunc(SB),0,$0-0
        	// Declared with flags 0, i.e. without NOSPLIT. rt0_go calls it right
        	// after installing g0 with the note "fault if stack check is wrong".
   783  	RET
   784  
   785  TEXT runtime·abort(SB),NOSPLIT,$0-0
        	// Crash the program with a breakpoint trap. Never returns.
   786  	INT $0x3
        	// INT 3 can be resumed (for example by a debugger). There is no RET here,
        	// so without a barrier execution would fall through into whatever
        	// function follows in the text segment. Spin instead.
        loop:
        	JMP	loop
   787  
   788  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   789  // redirects to memhash(p, h, size) using the size
   790  // stored in the closure.
   791  TEXT runtime·memhash_varlen(SB),NOSPLIT,$16-12
   792  	GO_ARGS
   793  	NO_LOCAL_POINTERS
   794  	MOVL	p+0(FP), AX
   795  	MOVL	h+4(FP), BX
   796  	MOVL	4(DX), CX	// CX = size, read from offset 4 of the closure in DX
   797  	MOVL	AX, 0(SP)
   798  	MOVL	BX, 4(SP)
   799  	MOVL	CX, 8(SP)
   800  	CALL	runtime·memhash(SB)
   801  	MOVL	12(SP), AX	// memhash's result follows its three arguments
   802  	MOVL	AX, ret+8(FP)
   803  	RET
   804  
   805  // hash function using AES hardware instructions
        // Loads the data pointer, size and result address into AX, BX and DX and
        // jumps to aeshashbody, which reads the seed itself from h+4(FP).
   806  TEXT runtime·aeshash(SB),NOSPLIT,$0-16
   807  	MOVL	p+0(FP), AX	// ptr to data
   808  	MOVL	s+8(FP), BX	// size
   809  	LEAL	ret+12(FP), DX	// address of the result slot
   810  	JMP	runtime·aeshashbody(SB)
   811  
   812  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
        	// String variant of aeshash: p points at a string header (data pointer
        	// at offset 0, length at offset 4); jumps to aeshashbody.
   813  	MOVL	p+0(FP), AX	// ptr to string object
   814  	MOVL	4(AX), BX	// length of string
   815  	MOVL	(AX), AX	// string data
   816  	LEAL	ret+8(FP), DX	// address of the result slot
   817  	JMP	runtime·aeshashbody(SB)
   818  
   819  // AX: data
   820  // BX: length
   821  // DX: address to put return value
        // The seed is read from h+4(FP); aeshash and aeshashstr reach this code
        // with JMP rather than CALL. The low 32 bits of the final state are
        // stored at (DX). Dispatches on length: 0, 1-15, 16, 17-32, 33-64, 65+.
   822  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   823  	MOVL	h+4(FP), X0	            // 32 bits of per-table hash seed
   824  	PINSRW	$4, BX, X0	            // 16 bits of length
   825  	PSHUFHW	$0, X0, X0	            // replace size with its low 2 bytes repeated 4 times
   826  	MOVO	X0, X1                      // save unscrambled seed
   827  	PXOR	runtime·aeskeysched(SB), X0 // xor in per-process seed
   828  	AESENC	X0, X0                      // scramble seed
   829  
   830  	CMPL	BX, $16
   831  	JB	aes0to15
   832  	JE	aes16
   833  	CMPL	BX, $32
   834  	JBE	aes17to32
   835  	CMPL	BX, $64
   836  	JBE	aes33to64
   837  	JMP	aes65plus
   838  	
   839  aes0to15:
   840  	TESTL	BX, BX
   841  	JE	aes0
   842  
   843  	ADDL	$16, AX
   844  	TESTW	$0xff0, AX	// zero iff bits 4-11 of the original address were all ones
   845  	JE	endofpage
   846  
   847  	// 16 bytes loaded at this address won't cross
   848  	// a page boundary, so we can load it directly.
   849  	MOVOU	-16(AX), X1
   850  	ADDL	BX, BX	// BX = 2*length, so BX*8 steps through 16-byte table entries
   851  	PAND	masks<>(SB)(BX*8), X1
   852  
   853  final1:	
   854  	AESENC	X0, X1  // scramble input, xor in seed
   855  	AESENC	X1, X1  // scramble combo 2 times
   856  	AESENC	X1, X1
   857  	MOVL	X1, (DX)
   858  	RET
   859  
   860  endofpage:
   861  	// original address has bits 4-11 all set (ends in 0xffX).  Might be up against
   862  	// a page boundary, so load ending at last byte.
   863  	// Then shift bytes down using pshufb.
   864  	MOVOU	-32(AX)(BX*1), X1
   865  	ADDL	BX, BX	// BX = 2*length, so BX*8 steps through 16-byte table entries
   866  	PSHUFB	shifts<>(SB)(BX*8), X1
   867  	JMP	final1
   868  
   869  aes0:
   870  	// Return scrambled input seed
   871  	AESENC	X0, X0
   872  	MOVL	X0, (DX)
   873  	RET
   874  
   875  aes16:
   876  	MOVOU	(AX), X1
   877  	JMP	final1
   878  
   879  aes17to32:
   880  	// make second starting seed
   881  	PXOR	runtime·aeskeysched+16(SB), X1
   882  	AESENC	X1, X1
   883  	
   884  	// load data to be hashed
   885  	MOVOU	(AX), X2	// first 16 bytes
   886  	MOVOU	-16(AX)(BX*1), X3	// last 16 bytes (overlaps the first block when length < 32)
   887  
   888  	// scramble 3 times
   889  	AESENC	X0, X2
   890  	AESENC	X1, X3
   891  	AESENC	X2, X2
   892  	AESENC	X3, X3
   893  	AESENC	X2, X2
   894  	AESENC	X3, X3
   895  
   896  	// combine results
   897  	PXOR	X3, X2
   898  	MOVL	X2, (DX)
   899  	RET
   900  
   901  aes33to64:
   902  	// make 3 more starting seeds
   903  	MOVO	X1, X2
   904  	MOVO	X1, X3
   905  	PXOR	runtime·aeskeysched+16(SB), X1
   906  	PXOR	runtime·aeskeysched+32(SB), X2
   907  	PXOR	runtime·aeskeysched+48(SB), X3
   908  	AESENC	X1, X1
   909  	AESENC	X2, X2
   910  	AESENC	X3, X3
   911  	
        	// load the first 32 and the last 32 bytes (they overlap when length < 64)
   912  	MOVOU	(AX), X4
   913  	MOVOU	16(AX), X5
   914  	MOVOU	-32(AX)(BX*1), X6
   915  	MOVOU	-16(AX)(BX*1), X7
   916  	
   917  	AESENC	X0, X4
   918  	AESENC	X1, X5
   919  	AESENC	X2, X6
   920  	AESENC	X3, X7
   921  	
   922  	AESENC	X4, X4
   923  	AESENC	X5, X5
   924  	AESENC	X6, X6
   925  	AESENC	X7, X7
   926  	
   927  	AESENC	X4, X4
   928  	AESENC	X5, X5
   929  	AESENC	X6, X6
   930  	AESENC	X7, X7
   931  
        	// fold the four lanes into X4
   932  	PXOR	X6, X4
   933  	PXOR	X7, X5
   934  	PXOR	X5, X4
   935  	MOVL	X4, (DX)
   936  	RET
   937  
   938  aes65plus:
   939  	// make 3 more starting seeds
   940  	MOVO	X1, X2
   941  	MOVO	X1, X3
   942  	PXOR	runtime·aeskeysched+16(SB), X1
   943  	PXOR	runtime·aeskeysched+32(SB), X2
   944  	PXOR	runtime·aeskeysched+48(SB), X3
   945  	AESENC	X1, X1
   946  	AESENC	X2, X2
   947  	AESENC	X3, X3
   948  	
   949  	// start with last (possibly overlapping) block
   950  	MOVOU	-64(AX)(BX*1), X4
   951  	MOVOU	-48(AX)(BX*1), X5
   952  	MOVOU	-32(AX)(BX*1), X6
   953  	MOVOU	-16(AX)(BX*1), X7
   954  
   955  	// scramble state once
   956  	AESENC	X0, X4
   957  	AESENC	X1, X5
   958  	AESENC	X2, X6
   959  	AESENC	X3, X7
   960  
   961  	// compute number of remaining 64-byte blocks: (length-1)/64
   962  	DECL	BX
   963  	SHRL	$6, BX
   964  	
   965  aesloop:
   966  	// scramble state, xor in a block
   967  	MOVOU	(AX), X0
   968  	MOVOU	16(AX), X1
   969  	MOVOU	32(AX), X2
   970  	MOVOU	48(AX), X3
   971  	AESENC	X0, X4
   972  	AESENC	X1, X5
   973  	AESENC	X2, X6
   974  	AESENC	X3, X7
   975  
   976  	// scramble state
   977  	AESENC	X4, X4
   978  	AESENC	X5, X5
   979  	AESENC	X6, X6
   980  	AESENC	X7, X7
   981  
   982  	ADDL	$64, AX	// advance to the next 64-byte block
   983  	DECL	BX
   984  	JNE	aesloop
   985  
   986  	// 2 more scrambles to finish
   987  	AESENC	X4, X4
   988  	AESENC	X5, X5
   989  	AESENC	X6, X6
   990  	AESENC	X7, X7
   991  	
   992  	AESENC	X4, X4
   993  	AESENC	X5, X5
   994  	AESENC	X6, X6
   995  	AESENC	X7, X7
   996  
        	// fold the four lanes into X4
   997  	PXOR	X6, X4
   998  	PXOR	X7, X5
   999  	PXOR	X5, X4
  1000  	MOVL	X4, (DX)
  1001  	RET
  1002  
  1003  TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
        	// Hashes the 4 bytes at p together with the seed h and stores a
        	// 32-bit result in ret.  X0 is assembled with the seed in dword 0
        	// and the data in dword 1, then put through three AESENC rounds
        	// keyed by the first 48 bytes of runtime·aeskeysched.
        	// Overwrites AX and X0.
  1004  	MOVL	p+0(FP), AX	// ptr to data
  1005  	MOVL	h+4(FP), X0	// seed
  1006  	PINSRD	$1, (AX), X0	// data
  1007  	AESENC	runtime·aeskeysched+0(SB), X0
  1008  	AESENC	runtime·aeskeysched+16(SB), X0
  1009  	AESENC	runtime·aeskeysched+32(SB), X0
  1010  	MOVL	X0, ret+8(FP)	// result is the low 32 bits of X0
  1011  	RET
  1012  
  1013  TEXT runtime·aeshash64(SB),NOSPLIT,$0-12
        	// Hashes the 8 bytes at p together with the seed h and stores a
        	// 32-bit result in ret.  X0 is assembled with the data in dwords
        	// 0-1 and the seed in dword 2, then put through three AESENC rounds
        	// keyed by the first 48 bytes of runtime·aeskeysched.
        	// Overwrites AX and X0.
  1014  	MOVL	p+0(FP), AX	// ptr to data
  1015  	MOVQ	(AX), X0	// data
  1016  	PINSRD	$2, h+4(FP), X0	// seed
  1017  	AESENC	runtime·aeskeysched+0(SB), X0
  1018  	AESENC	runtime·aeskeysched+16(SB), X0
  1019  	AESENC	runtime·aeskeysched+32(SB), X0
  1020  	MOVL	X0, ret+8(FP)	// result is the low 32 bits of X0
  1021  	RET
  1022  
  1023  // simple mask to get rid of data in the high part of the register.
        // The table holds 16 entries of 16 bytes each.  Entry n (at byte
        // offset n*16) has its low n bytes set to 0xff and the remaining
        // bytes zero, so ANDing a 128-bit value with entry n keeps only its
        // low n bytes.  checkASM verifies that the table is 16-byte aligned.
  1024  DATA masks<>+0x00(SB)/4, $0x00000000
  1025  DATA masks<>+0x04(SB)/4, $0x00000000
  1026  DATA masks<>+0x08(SB)/4, $0x00000000
  1027  DATA masks<>+0x0c(SB)/4, $0x00000000
  1028  	
  1029  DATA masks<>+0x10(SB)/4, $0x000000ff
  1030  DATA masks<>+0x14(SB)/4, $0x00000000
  1031  DATA masks<>+0x18(SB)/4, $0x00000000
  1032  DATA masks<>+0x1c(SB)/4, $0x00000000
  1033  	
  1034  DATA masks<>+0x20(SB)/4, $0x0000ffff
  1035  DATA masks<>+0x24(SB)/4, $0x00000000
  1036  DATA masks<>+0x28(SB)/4, $0x00000000
  1037  DATA masks<>+0x2c(SB)/4, $0x00000000
  1038  	
  1039  DATA masks<>+0x30(SB)/4, $0x00ffffff
  1040  DATA masks<>+0x34(SB)/4, $0x00000000
  1041  DATA masks<>+0x38(SB)/4, $0x00000000
  1042  DATA masks<>+0x3c(SB)/4, $0x00000000
  1043  	
  1044  DATA masks<>+0x40(SB)/4, $0xffffffff
  1045  DATA masks<>+0x44(SB)/4, $0x00000000
  1046  DATA masks<>+0x48(SB)/4, $0x00000000
  1047  DATA masks<>+0x4c(SB)/4, $0x00000000
  1048  	
  1049  DATA masks<>+0x50(SB)/4, $0xffffffff
  1050  DATA masks<>+0x54(SB)/4, $0x000000ff
  1051  DATA masks<>+0x58(SB)/4, $0x00000000
  1052  DATA masks<>+0x5c(SB)/4, $0x00000000
  1053  	
  1054  DATA masks<>+0x60(SB)/4, $0xffffffff
  1055  DATA masks<>+0x64(SB)/4, $0x0000ffff
  1056  DATA masks<>+0x68(SB)/4, $0x00000000
  1057  DATA masks<>+0x6c(SB)/4, $0x00000000
  1058  	
  1059  DATA masks<>+0x70(SB)/4, $0xffffffff
  1060  DATA masks<>+0x74(SB)/4, $0x00ffffff
  1061  DATA masks<>+0x78(SB)/4, $0x00000000
  1062  DATA masks<>+0x7c(SB)/4, $0x00000000
  1063  	
  1064  DATA masks<>+0x80(SB)/4, $0xffffffff
  1065  DATA masks<>+0x84(SB)/4, $0xffffffff
  1066  DATA masks<>+0x88(SB)/4, $0x00000000
  1067  DATA masks<>+0x8c(SB)/4, $0x00000000
  1068  	
  1069  DATA masks<>+0x90(SB)/4, $0xffffffff
  1070  DATA masks<>+0x94(SB)/4, $0xffffffff
  1071  DATA masks<>+0x98(SB)/4, $0x000000ff
  1072  DATA masks<>+0x9c(SB)/4, $0x00000000
  1073  	
  1074  DATA masks<>+0xa0(SB)/4, $0xffffffff
  1075  DATA masks<>+0xa4(SB)/4, $0xffffffff
  1076  DATA masks<>+0xa8(SB)/4, $0x0000ffff
  1077  DATA masks<>+0xac(SB)/4, $0x00000000
  1078  	
  1079  DATA masks<>+0xb0(SB)/4, $0xffffffff
  1080  DATA masks<>+0xb4(SB)/4, $0xffffffff
  1081  DATA masks<>+0xb8(SB)/4, $0x00ffffff
  1082  DATA masks<>+0xbc(SB)/4, $0x00000000
  1083  	
  1084  DATA masks<>+0xc0(SB)/4, $0xffffffff
  1085  DATA masks<>+0xc4(SB)/4, $0xffffffff
  1086  DATA masks<>+0xc8(SB)/4, $0xffffffff
  1087  DATA masks<>+0xcc(SB)/4, $0x00000000
  1088  	
  1089  DATA masks<>+0xd0(SB)/4, $0xffffffff
  1090  DATA masks<>+0xd4(SB)/4, $0xffffffff
  1091  DATA masks<>+0xd8(SB)/4, $0xffffffff
  1092  DATA masks<>+0xdc(SB)/4, $0x000000ff
  1093  	
  1094  DATA masks<>+0xe0(SB)/4, $0xffffffff
  1095  DATA masks<>+0xe4(SB)/4, $0xffffffff
  1096  DATA masks<>+0xe8(SB)/4, $0xffffffff
  1097  DATA masks<>+0xec(SB)/4, $0x0000ffff
  1098  	
  1099  DATA masks<>+0xf0(SB)/4, $0xffffffff
  1100  DATA masks<>+0xf4(SB)/4, $0xffffffff
  1101  DATA masks<>+0xf8(SB)/4, $0xffffffff
  1102  DATA masks<>+0xfc(SB)/4, $0x00ffffff
  1103  
  1104  GLOBL masks<>(SB),RODATA,$256
  1105  
  1106  // these are arguments to pshufb.  They move data down from
  1107  // the high bytes of the register to the low bytes of the register.
  1108  // index is how many bytes to move.
        // The table holds 16 entries of 16 bytes each; entry n is at byte
        // offset n*16.  In entry n (n >= 1) the low n control bytes name
        // source bytes 16-n..15, and every other control byte is 0xff, whose
        // set high bit makes PSHUFB write zero to that lane.  Entry 0 is all
        // zero.  checkASM verifies that the table is 16-byte aligned.
  1109  DATA shifts<>+0x00(SB)/4, $0x00000000
  1110  DATA shifts<>+0x04(SB)/4, $0x00000000
  1111  DATA shifts<>+0x08(SB)/4, $0x00000000
  1112  DATA shifts<>+0x0c(SB)/4, $0x00000000
  1113  	
  1114  DATA shifts<>+0x10(SB)/4, $0xffffff0f
  1115  DATA shifts<>+0x14(SB)/4, $0xffffffff
  1116  DATA shifts<>+0x18(SB)/4, $0xffffffff
  1117  DATA shifts<>+0x1c(SB)/4, $0xffffffff
  1118  	
  1119  DATA shifts<>+0x20(SB)/4, $0xffff0f0e
  1120  DATA shifts<>+0x24(SB)/4, $0xffffffff
  1121  DATA shifts<>+0x28(SB)/4, $0xffffffff
  1122  DATA shifts<>+0x2c(SB)/4, $0xffffffff
  1123  	
  1124  DATA shifts<>+0x30(SB)/4, $0xff0f0e0d
  1125  DATA shifts<>+0x34(SB)/4, $0xffffffff
  1126  DATA shifts<>+0x38(SB)/4, $0xffffffff
  1127  DATA shifts<>+0x3c(SB)/4, $0xffffffff
  1128  	
  1129  DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c
  1130  DATA shifts<>+0x44(SB)/4, $0xffffffff
  1131  DATA shifts<>+0x48(SB)/4, $0xffffffff
  1132  DATA shifts<>+0x4c(SB)/4, $0xffffffff
  1133  	
  1134  DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b
  1135  DATA shifts<>+0x54(SB)/4, $0xffffff0f
  1136  DATA shifts<>+0x58(SB)/4, $0xffffffff
  1137  DATA shifts<>+0x5c(SB)/4, $0xffffffff
  1138  	
  1139  DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a
  1140  DATA shifts<>+0x64(SB)/4, $0xffff0f0e
  1141  DATA shifts<>+0x68(SB)/4, $0xffffffff
  1142  DATA shifts<>+0x6c(SB)/4, $0xffffffff
  1143  	
  1144  DATA shifts<>+0x70(SB)/4, $0x0c0b0a09
  1145  DATA shifts<>+0x74(SB)/4, $0xff0f0e0d
  1146  DATA shifts<>+0x78(SB)/4, $0xffffffff
  1147  DATA shifts<>+0x7c(SB)/4, $0xffffffff
  1148  	
  1149  DATA shifts<>+0x80(SB)/4, $0x0b0a0908
  1150  DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c
  1151  DATA shifts<>+0x88(SB)/4, $0xffffffff
  1152  DATA shifts<>+0x8c(SB)/4, $0xffffffff
  1153  	
  1154  DATA shifts<>+0x90(SB)/4, $0x0a090807
  1155  DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b
  1156  DATA shifts<>+0x98(SB)/4, $0xffffff0f
  1157  DATA shifts<>+0x9c(SB)/4, $0xffffffff
  1158  	
  1159  DATA shifts<>+0xa0(SB)/4, $0x09080706
  1160  DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a
  1161  DATA shifts<>+0xa8(SB)/4, $0xffff0f0e
  1162  DATA shifts<>+0xac(SB)/4, $0xffffffff
  1163  	
  1164  DATA shifts<>+0xb0(SB)/4, $0x08070605
  1165  DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09
  1166  DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d
  1167  DATA shifts<>+0xbc(SB)/4, $0xffffffff
  1168  	
  1169  DATA shifts<>+0xc0(SB)/4, $0x07060504
  1170  DATA shifts<>+0xc4(SB)/4, $0x0b0a0908
  1171  DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c
  1172  DATA shifts<>+0xcc(SB)/4, $0xffffffff
  1173  	
  1174  DATA shifts<>+0xd0(SB)/4, $0x06050403
  1175  DATA shifts<>+0xd4(SB)/4, $0x0a090807
  1176  DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b
  1177  DATA shifts<>+0xdc(SB)/4, $0xffffff0f
  1178  	
  1179  DATA shifts<>+0xe0(SB)/4, $0x05040302
  1180  DATA shifts<>+0xe4(SB)/4, $0x09080706
  1181  DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a
  1182  DATA shifts<>+0xec(SB)/4, $0xffff0f0e
  1183  	
  1184  DATA shifts<>+0xf0(SB)/4, $0x04030201
  1185  DATA shifts<>+0xf4(SB)/4, $0x08070605
  1186  DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09
  1187  DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d
  1188  
  1189  GLOBL shifts<>(SB),RODATA,$256
  1190  
  1191  TEXT ·checkASM(SB),NOSPLIT,$0-1
        	// Reports (as a one-byte bool in ret) whether both lookup tables
        	// start on a 16-byte boundary.  Overwrites AX and BX.
  1192  	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
  1193  	MOVL	$masks<>(SB), AX
  1194  	MOVL	$shifts<>(SB), BX
  1195  	ORL	BX, AX	// a low bit set in either address survives the OR
  1196  	TESTL	$15, AX	// ZF is set only when the low 4 bits of both are clear
  1197  	SETEQ	ret+0(FP)
  1198  	RET
  1199  
  1200  TEXT runtime·memeq(SB),NOSPLIT,$0-13
        	// Entry point for comparing two size-byte memory regions.  It only
        	// loads the registers memeqbody expects and tail-jumps to it;
        	// memeqbody writes the one-byte result through AX.
  1201  	LEAL	ret+12(FP), AX	// AX = address of the result byte
  1202  	MOVL	size+8(FP), BX	// BX = number of bytes to compare
  1203  	MOVL	b+4(FP), DI	// DI = second region
  1204  	MOVL	a+0(FP), SI	// SI = first region
  1205  	JMP	runtime·memeqbody(SB)
  1206  
  1207  // memequal_varlen(a, b unsafe.Pointer) bool
        // Compares two regions whose common size is not an argument but is
        // read from offset 4 of the object DX points to.  Identical pointers
        // are reported equal without reading the size or the data.
  1208  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
  1209  	MOVL    a+0(FP), SI
  1210  	MOVL    b+4(FP), DI
  1211  	CMPL    SI, DI
  1212  	JEQ     eq	// same address: trivially equal
  1213  	MOVL    4(DX), BX    // compiler stores size at offset 4 in the closure
  1214  	LEAL	ret+8(FP), AX	// memeqbody writes the result byte through AX
  1215  	JMP	runtime·memeqbody(SB)
  1216  eq:
  1217  	MOVB    $1, ret+8(FP)
  1218  	RET
  1219  
  1220  // eqstring tests whether two strings are equal.
  1221  // The compiler guarantees that strings passed
  1222  // to eqstring have equal length.
  1223  // See runtime_test.go:eqstring_generic for
  1224  // equivalent Go code.
        // Only the length of the first string is read.  Strings that share a
        // data pointer are reported equal without touching their bytes.
  1225  TEXT runtime·eqstring(SB),NOSPLIT,$0-17
  1226  	MOVL	s1str+0(FP), SI
  1227  	MOVL	s2str+8(FP), DI
  1228  	CMPL	SI, DI
  1229  	JEQ	same	// same data pointer: equal
  1230  	MOVL	s1len+4(FP), BX
  1231  	LEAL	v+16(FP), AX	// memeqbody writes the result byte through AX
  1232  	JMP	runtime·memeqbody(SB)
  1233  same:
  1234  	MOVB	$1, v+16(FP)
  1235  	RET
  1236  
  1237  TEXT bytes·Equal(SB),NOSPLIT,$0-25
        	// Reports whether byte slices a and b have the same length and the
        	// same contents.  Slices of different length are unequal without
        	// looking at the data; otherwise the comparison is delegated to
        	// memeqbody, which writes the result byte through AX.
  1238  	MOVL	a_len+4(FP), BX
  1239  	MOVL	b_len+16(FP), CX
  1240  	CMPL	BX, CX
  1241  	JNE	eqret	// lengths differ
  1242  	MOVL	a+0(FP), SI
  1243  	MOVL	b+12(FP), DI
  1244  	LEAL	ret+24(FP), AX
  1245  	JMP	runtime·memeqbody(SB)
  1246  eqret:
  1247  	MOVB	$0, ret+24(FP)
  1248  	RET
  1249  
  1250  // a in SI
  1251  // b in DI
  1252  // count in BX
  1253  // address of result byte in AX
        // Stores 1 through AX when the two count-byte regions are equal and 0
        // otherwise.  Overwrites BX, CX, DX, SI, DI and, on the SSE2 path,
        // X0-X7.  Reached by JMP from the entry points, so its RET returns to
        // their caller.
  1254  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1255  	CMPL	BX, $4
  1256  	JB	small	// 0-3 bytes: too short for the 4-byte tail load used below
  1257  
  1258  	// 64 bytes at a time using xmm registers
  1259  hugeloop:
  1260  	CMPL	BX, $64
  1261  	JB	bigloop
  1262  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
  1263  	JE	bigloop
  1264  	MOVOU	(SI), X0
  1265  	MOVOU	(DI), X1
  1266  	MOVOU	16(SI), X2
  1267  	MOVOU	16(DI), X3
  1268  	MOVOU	32(SI), X4
  1269  	MOVOU	32(DI), X5
  1270  	MOVOU	48(SI), X6
  1271  	MOVOU	48(DI), X7
        	// each PCMPEQB leaves 0xff in every byte position that matched
  1272  	PCMPEQB	X1, X0
  1273  	PCMPEQB	X3, X2
  1274  	PCMPEQB	X5, X4
  1275  	PCMPEQB	X7, X6
        	// fold the four results together: a byte stays 0xff only if it
        	// matched in all four 16-byte lanes
  1276  	PAND	X2, X0
  1277  	PAND	X6, X4
  1278  	PAND	X4, X0
  1279  	PMOVMSKB X0, DX	// one bit per byte of the folded result
  1280  	ADDL	$64, SI
  1281  	ADDL	$64, DI
  1282  	SUBL	$64, BX
  1283  	CMPL	DX, $0xffff	// all 16 bits set: the whole 64-byte chunk matched
  1284  	JEQ	hugeloop
  1285  	MOVB	$0, (AX)
  1286  	RET
  1287  
  1288  	// 4 bytes at a time using 32-bit register
  1289  bigloop:
  1290  	CMPL	BX, $4
  1291  	JBE	leftover
  1292  	MOVL	(SI), CX
  1293  	MOVL	(DI), DX
  1294  	ADDL	$4, SI
  1295  	ADDL	$4, DI
  1296  	SUBL	$4, BX
  1297  	CMPL	CX, DX
  1298  	JEQ	bigloop
  1299  	MOVB	$0, (AX)
  1300  	RET
  1301  
  1302  	// remaining 0-4 bytes
        	// Compare the 4 bytes that end at the end of the data.  When fewer
        	// than 4 bytes remain this re-reads bytes that already matched,
        	// which is harmless; at least 4 bytes were present on entry.
  1303  leftover:
  1304  	MOVL	-4(SI)(BX*1), CX
  1305  	MOVL	-4(DI)(BX*1), DX
  1306  	CMPL	CX, DX
  1307  	SETEQ	(AX)
  1308  	RET
  1309  
        	// 0-3 bytes in total
  1310  small:
  1311  	CMPL	BX, $0
  1312  	JEQ	equal	// nothing to compare; ZF is set, so SETEQ below stores 1
  1313  
  1314  	LEAL	0(BX*8), CX
  1315  	NEGL	CX	// CX = -8*count; as a shift count it acts as 32-8*count
  1316  
  1317  	MOVL	SI, DX
  1318  	CMPB	DX, $0xfc	// only the low byte of the address is examined
  1319  	JA	si_high
  1320  
  1321  	// load at SI won't cross a page boundary.
        	// (the bytes past count that come along are discarded by the SHLL below)
  1322  	MOVL	(SI), SI
  1323  	JMP	si_finish
  1324  si_high:
  1325  	// address ends in 111111xx.  Load up to bytes we want, move to correct position.
  1326  	MOVL	-4(SI)(BX*1), SI
  1327  	SHRL	CX, SI
  1328  si_finish:
  1329  
  1330  	// same for DI.
  1331  	MOVL	DI, DX
  1332  	CMPB	DX, $0xfc
  1333  	JA	di_high
  1334  	MOVL	(DI), DI
  1335  	JMP	di_finish
  1336  di_high:
  1337  	MOVL	-4(DI)(BX*1), DI
  1338  	SHRL	CX, DI
  1339  di_finish:
  1340  
        	// The low count bytes of the difference are zero exactly when the
        	// low count bytes of the operands agree; the shift throws away the
        	// rest and sets ZF from what is left.
  1341  	SUBL	SI, DI
  1342  	SHLL	CX, DI
  1343  equal:
  1344  	SETEQ	(AX)
  1345  	RET
  1346  
  1347  TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
        	// Three-way comparison of two strings.  This stub only loads the
        	// registers cmpbody expects and tail-jumps to it; cmpbody stores
        	// 1, 0 or -1 in the word AX points to.
  1348  	LEAL	ret+16(FP), AX	// AX = address of the result word
  1349  	MOVL	s2_len+12(FP), DX	// DX = length of the second string
  1350  	MOVL	s2_base+8(FP), DI	// DI = data of the second string
  1351  	MOVL	s1_len+4(FP), BX	// BX = length of the first string
  1352  	MOVL	s1_base+0(FP), SI	// SI = data of the first string
  1353  	JMP	runtime·cmpbody(SB)
  1354  
  1355  TEXT bytes·Compare(SB),NOSPLIT,$0-28
        	// Three-way comparison of two byte slices.  This stub only loads
        	// the registers cmpbody expects and tail-jumps to it; cmpbody
        	// stores 1, 0 or -1 in the word AX points to.
  1356  	LEAL	ret+24(FP), AX	// AX = address of the result word
  1357  	MOVL	s2+16(FP), DX	// DX = length of the second slice
  1358  	MOVL	s2+12(FP), DI	// DI = data of the second slice
  1359  	MOVL	s1+4(FP), BX	// BX = length of the first slice
  1360  	MOVL	s1+0(FP), SI	// SI = data of the first slice
  1361  	JMP	runtime·cmpbody(SB)
  1362  
  1363  TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
        	// Stores in ret the index of the first byte equal to c in the
        	// slice s, or -1 if there is none.
        	// Overwrites AX, CX, SI, DI and clears the direction flag.
  1364  	MOVL	s+0(FP), SI
  1365  	MOVL	s_len+4(FP), CX
  1366  	MOVB	c+12(FP), AL
  1367  	MOVL	SI, DI	// DI is the scan cursor; SI keeps the start for the index math
        	// Scan forward while bytes differ from AL and CX is non-zero.
        	// ZF is set afterwards when the last byte examined matched.
        	// With a zero length nothing is examined and the flags are left
        	// as they were, but DI == SI then, so both paths below store -1.
  1368  	CLD; REPN; SCASB
  1369  	JZ 3(PC)	// found: skip the not-found store and its RET
  1370  	MOVL	$-1, ret+16(FP)
  1371  	RET
  1372  	SUBL	SI, DI
  1373  	SUBL	$1, DI	// SCASB stepped one past the match
  1374  	MOVL	DI, ret+16(FP)
  1375  	RET
  1376  
  1377  TEXT strings·IndexByte(SB),NOSPLIT,$0-16
        	// Stores in ret the index of the first byte equal to c in the
        	// string s, or -1 if there is none.
        	// Overwrites AX, CX, SI, DI and clears the direction flag.
  1378  	MOVL	s+0(FP), SI
  1379  	MOVL	s_len+4(FP), CX
  1380  	MOVB	c+8(FP), AL
  1381  	MOVL	SI, DI	// DI is the scan cursor; SI keeps the start for the index math
        	// Scan forward while bytes differ from AL and CX is non-zero.
        	// ZF is set afterwards when the last byte examined matched.
        	// With a zero length nothing is examined and the flags are left
        	// as they were, but DI == SI then, so both paths below store -1.
  1382  	CLD; REPN; SCASB
  1383  	JZ 3(PC)	// found: skip the not-found store and its RET
  1384  	MOVL	$-1, ret+12(FP)
  1385  	RET
  1386  	SUBL	SI, DI
  1387  	SUBL	$1, DI	// SCASB stepped one past the match
  1388  	MOVL	DI, ret+12(FP)
  1389  	RET
  1390  
  1391  // input:
  1392  //   SI = a
  1393  //   DI = b
  1394  //   BX = alen
  1395  //   DX = blen
  1396  //   AX = address of return word (set to 1/0/-1)
        // Compares the first min(alen, blen) bytes as unsigned bytes in memory
        // order; if they all match, the shorter input orders first.
        // Overwrites BX, CX, DX, BP, SI, DI and, on the SSE2 path, X0 and X1.
  1397  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1398  	MOVL	DX, BP
  1399  	SUBL	BX, DX // DX = blen-alen
  1400  	CMOVLGT	BX, BP // BP = min(alen, blen)
  1401  	CMPL	SI, DI
  1402  	JEQ	allsame	// same data pointer: only the lengths can differ
  1403  	CMPL	BP, $4
  1404  	JB	small
  1405  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
  1406  	JE	mediumloop
        	// 16 bytes at a time using xmm registers
  1407  largeloop:
  1408  	CMPL	BP, $16
  1409  	JB	mediumloop
  1410  	MOVOU	(SI), X0
  1411  	MOVOU	(DI), X1
  1412  	PCMPEQB X0, X1	// 0xff in every byte position that matched
  1413  	PMOVMSKB X1, BX	// one bit per byte
  1414  	XORL	$0xffff, BX	// convert EQ to NE
  1415  	JNE	diff16	// branch if at least one byte is not equal
  1416  	ADDL	$16, SI
  1417  	ADDL	$16, DI
  1418  	SUBL	$16, BP
  1419  	JMP	largeloop
  1420  
  1421  diff16:
  1422  	BSFL	BX, BX	// index of first byte that differs
  1423  	XORL	DX, DX	// the length difference is no longer needed
  1424  	MOVB	(SI)(BX*1), CX
  1425  	CMPB	CX, (DI)(BX*1)
  1426  	SETHI	DX	// 1 if a's byte is the larger one (unsigned)
  1427  	LEAL	-1(DX*2), DX	// convert 1/0 to +1/-1
  1428  	MOVL	DX, (AX)
  1429  	RET
  1430  
        	// 4 bytes at a time using 32-bit registers
  1431  mediumloop:
  1432  	CMPL	BP, $4
  1433  	JBE	_0through4
  1434  	MOVL	(SI), BX
  1435  	MOVL	(DI), CX
  1436  	CMPL	BX, CX
  1437  	JNE	diff4
  1438  	ADDL	$4, SI
  1439  	ADDL	$4, DI
  1440  	SUBL	$4, BP
  1441  	JMP	mediumloop
  1442  
        	// Compare the 4 bytes that end at the end of the common prefix.
        	// When fewer than 4 bytes remain this re-reads bytes that already
        	// matched; at least 4 common bytes were present on entry.
  1443  _0through4:
  1444  	MOVL	-4(SI)(BP*1), BX
  1445  	MOVL	-4(DI)(BP*1), CX
  1446  	CMPL	BX, CX
  1447  	JEQ	allsame
  1448  
        	// BX and CX hold 4 bytes of a and b that differ somewhere.
        	// After the byte swap the first byte in memory is the most
        	// significant, so the highest differing bit decides the order.
  1449  diff4:
  1450  	BSWAPL	BX	// reverse order of bytes
  1451  	BSWAPL	CX
  1452  	XORL	BX, CX	// find bit differences
  1453  	BSRL	CX, CX	// index of highest bit difference
  1454  	SHRL	CX, BX	// move a's bit to bottom
  1455  	ANDL	$1, BX	// mask bit
  1456  	LEAL	-1(BX*2), BX // 1/0 => +1/-1
  1457  	MOVL	BX, (AX)
  1458  	RET
  1459  
  1460  	// 0-3 bytes in common
  1461  small:
  1462  	LEAL	(BP*8), CX
  1463  	NEGL	CX	// CX = -8*BP; as a shift count it acts as 32-8*BP
  1464  	JEQ	allsame	// NEGL set ZF: no bytes in common
  1465  
  1466  	// load si
        	// A 4-byte load at SI is used unless the low byte of the address
        	// is above 0xfc; in that case the 4 bytes ending at SI+BP are
        	// loaded and shifted down instead.
  1467  	CMPB	SI, $0xfc
  1468  	JA	si_high
  1469  	MOVL	(SI), SI
  1470  	JMP	si_finish
  1471  si_high:
  1472  	MOVL	-4(SI)(BP*1), SI
  1473  	SHRL	CX, SI
  1474  si_finish:
  1475  	SHLL	CX, SI	// wanted bytes move to the top; zeros fill in below
  1476  
  1477  	// same for di
  1478  	CMPB	DI, $0xfc
  1479  	JA	di_high
  1480  	MOVL	(DI), DI
  1481  	JMP	di_finish
  1482  di_high:
  1483  	MOVL	-4(DI)(BP*1), DI
  1484  	SHRL	CX, DI
  1485  di_finish:
  1486  	SHLL	CX, DI
  1487  
  1488  	BSWAPL	SI	// reverse order of bytes
  1489  	BSWAPL	DI
  1490  	XORL	SI, DI	// find bit differences
  1491  	JEQ	allsame
  1492  	BSRL	DI, CX	// index of highest bit difference
  1493  	SHRL	CX, SI	// move a's bit to bottom
  1494  	ANDL	$1, SI	// mask bit
  1495  	LEAL	-1(SI*2), BX // 1/0 => +1/-1
  1496  	MOVL	BX, (AX)
  1497  	RET
  1498  
  1499  	// all the bytes in common are the same, so we just need
  1500  	// to compare the lengths.
        	// DX still holds blen-alen here.  The registers are cleared before
        	// TESTL because XORL would overwrite the flags the SETs read.
  1501  allsame:
  1502  	XORL	BX, BX
  1503  	XORL	CX, CX
  1504  	TESTL	DX, DX
  1505  	SETLT	BX	// 1 if alen > blen
  1506  	SETEQ	CX	// 1 if alen == blen
  1507  	LEAL	-1(CX)(BX*2), BX	// 1,0,-1 result
  1508  	MOVL	BX, (AX)
  1509  	RET
  1510  
  1511  TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
        	// Advances the generator state kept in the current m's fastrand
        	// field and returns the new state.  One step is:
        	//	x += x; if bit 31 of x is set { x ^= 0x88888eef }
        	// Overwrites AX, BX, CX and DX.
  1512  	get_tls(CX)
  1513  	MOVL	g(CX), AX
  1514  	MOVL	g_m(AX), AX
  1515  	MOVL	m_fastrand(AX), DX
  1516  	ADDL	DX, DX	// DX = 2*x
  1517  	MOVL	DX, BX	// BX keeps the un-xored value
  1518  	XORL	$0x88888eef, DX
        	// The constant has bit 31 set, so a negative XOR result means
        	// 2*x had bit 31 clear; in that case take 2*x without the XOR.
  1519  	CMOVLMI	BX, DX
  1520  	MOVL	DX, m_fastrand(AX)
  1521  	MOVL	DX, ret+0(FP)
  1522  	RET
  1523  
  1524  TEXT runtime·return0(SB), NOSPLIT, $0
        	// Sets AX to zero and returns; it reads no arguments and writes no
        	// result slot on the stack.
  1525  	MOVL	$0, AX
  1526  	RET
  1527  
  1528  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  1529  // Must obey the gcc calling convention.
        // The value is returned in AX; CX is used to reach the TLS slot and
        // is overwritten.
  1530  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  1531  	get_tls(CX)
  1532  	MOVL	g(CX), AX	// AX = g
  1533  	MOVL	g_m(AX), AX	// AX = g->m
  1534  	MOVL	m_curg(AX), AX	// AX = g->m->curg
  1535  	MOVL	(g_stack+stack_hi)(AX), AX	// AX = curg.stack.hi
  1536  	RET
  1537  
  1538  // The top-most function running on a goroutine
  1539  // returns to goexit+PCQuantum.
        // NOTE(review): the leading NOP presumably exists so that the return
        // address goexit+PCQuantum falls inside this function rather than on
        // its first byte -- confirm against the code that sets up that address.
  1540  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  1541  	BYTE	$0x90	// NOP
  1542  	CALL	runtime·goexit1(SB)	// does not return
  1543  	// traceback from goexit1 must hit code range of goexit
        	// (the return address of the CALL above is the byte below, which
        	// is still part of this function)
  1544  	BYTE	$0x90	// NOP
  1545  
  1546  // Prefetching doesn't seem to help.
        // This stub therefore issues no prefetch: it ignores its 4-byte
        // argument and returns immediately.
  1547  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4
  1548  	RET
  1549  
  1550  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4
  1551  	RET	// no-op stub: the argument is ignored and no prefetch is issued
  1552  
  1553  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4
  1554  	RET	// no-op stub: the argument is ignored and no prefetch is issued
  1555  
  1556  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4
  1557  	RET	// no-op stub: the argument is ignored and no prefetch is issued
  1558  
  1559  // Add a module's moduledata to the linked list of moduledata objects.  This
  1560  // is called from .init_array by a function generated in the linker and so
  1561  // follows the platform ABI wrt register preservation -- it only touches AX,
  1562  // CX (implicitly) and DX, but it does not follow the ABI wrt arguments:
  1563  // instead the pointer to the moduledata is passed in AX.
        // The new moduledata is appended after the current tail
        // (runtime·lastmoduledatap) and then becomes the tail itself.
  1564  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  1565         MOVL    runtime·lastmoduledatap(SB), DX // DX = current tail of the list
  1566         MOVL    AX, moduledata_next(DX) // tail.next = new moduledata
  1567         MOVL    AX, runtime·lastmoduledatap(SB) // new moduledata is now the tail
  1568         RET