github.com/fjballest/golang@v0.0.0-20151209143359-e4c5fe594ca8/src/runtime/asm_386.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
        	// Startup sequence: save argc/argv, give g0 provisional stack bounds, probe
        	// the CPU, set up TLS (through _cgo_init or ldt0setup), link m0 and g0, run
        	// check/args/osinit/schedinit, queue runtime·main with newproc, call mstart.
    11  	// copy arguments forward on an even stack
    12  	MOVL	argc+0(FP), AX
    13  	MOVL	argv+4(FP), BX
    14  	SUBL	$128, SP		// plenty of scratch
    15  	ANDL	$~15, SP		// round SP down to a multiple of 16
    16  	MOVL	AX, 120(SP)		// save argc, argv away
    17  	MOVL	BX, 124(SP)
    18  
    19  	// set default stack bounds.
    20  	// _cgo_init may update stackguard.
    21  	MOVL	$runtime·g0(SB), BP	// BP = &g0; not written again before the _cgo_init call below
    22  	LEAL	(-64*1024+104)(SP), BX	// BX = SP - 64KB + 104
    23  	MOVL	BX, g_stackguard0(BP)
    24  	MOVL	BX, g_stackguard1(BP)
    25  	MOVL	BX, (g_stack+stack_lo)(BP)
    26  	MOVL	SP, (g_stack+stack_hi)(BP)	// stack.hi = current SP
    27  	
    28  	// find out information about the processor we're on
    29  #ifdef GOOS_nacl // NaCl doesn't like PUSHFL/POPFL
    30  	JMP 	has_cpuid
    31  #else
    32  	// first see if CPUID instruction is supported.
    33  	PUSHFL			// saved copy of EFLAGS, restored below
    34  	PUSHFL			// working copy
    35  	XORL	$(1<<21), 0(SP) // flip ID bit
    36  	POPFL			// try to load the modified flags
    37  	PUSHFL
    38  	POPL	AX		// AX = EFLAGS after the attempted flip
    39  	XORL	0(SP), AX	// AX = bits that differ from the saved copy
    40  	POPFL	// restore EFLAGS
    41  	TESTL	$(1<<21), AX
    42  	JNE 	has_cpuid	// the ID bit could be toggled
    43  #endif
    44  
    45  bad_proc: // show that the program requires MMX.
    46  	MOVL	$2, 0(SP)	// write(2, bad_proc_msg, 0x3d)
    47  	MOVL	$bad_proc_msg<>(SB), 4(SP)
    48  	MOVL	$0x3d, 8(SP)
    49  	CALL	runtime·write(SB)
    50  	MOVL	$1, 0(SP)	// exit(1)
    51  	CALL	runtime·exit(SB)
    52  	INT	$3
    53  
    54  has_cpuid:
    55  	MOVL	$0, AX		// CPUID with AX = 0
    56  	CPUID
    57  	CMPL	AX, $0
    58  	JE	nocpuinfo	// AX came back 0: skip the vendor and feature checks
    59  
    60  	// Figure out how to serialize RDTSC.
    61  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    62  	// Don't know about the rest, so let's do MFENCE.
    63  	CMPL	BX, $0x756E6547  // "Genu"
    64  	JNE	notintel
    65  	CMPL	DX, $0x49656E69  // "ineI"
    66  	JNE	notintel
    67  	CMPL	CX, $0x6C65746E  // "ntel"
    68  	JNE	notintel
    69  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)	// read back by cputicks
    70  notintel:
    71  
    72  	MOVL	$1, AX		// CPUID with AX = 1; CX and DX are recorded below
    73  	CPUID
    74  	MOVL	CX, AX // Move to global variable clobbers CX when generating PIC
    75  	MOVL	AX, runtime·cpuid_ecx(SB)
    76  	MOVL	DX, runtime·cpuid_edx(SB)
    77  
    78  	// Check for MMX support
    79  	TESTL	$(1<<23), DX	// MMX
    80  	JZ 	bad_proc
    81  
    82  nocpuinfo:	
    83  
    84  	// if there is an _cgo_init, call it to let it
    85  	// initialize and to set up GS.  if not,
    86  	// we set up GS ourselves.
    87  	MOVL	_cgo_init(SB), AX
    88  	TESTL	AX, AX
    89  	JZ	needtls
    90  	MOVL	$setg_gcc<>(SB), BX
    91  	MOVL	BX, 4(SP)	// second argument: address of setg_gcc
    92  	MOVL	BP, 0(SP)	// first argument: &g0 (BP was loaded above)
    93  	CALL	AX
    94  
    95  	// update stackguard after _cgo_init
    96  	MOVL	$runtime·g0(SB), CX
    97  	MOVL	(g_stack+stack_lo)(CX), AX
    98  	ADDL	$const__StackGuard, AX	// guard = stack.lo + _StackGuard
    99  	MOVL	AX, g_stackguard0(CX)
   100  	MOVL	AX, g_stackguard1(CX)
   101  
   102  #ifndef GOOS_windows
   103  	// skip runtime·ldt0setup(SB) and tls test after _cgo_init for non-windows
   104  	JMP ok
   105  #endif
   106  needtls:
   107  #ifdef GOOS_plan9
   108  	// skip runtime·ldt0setup(SB) and tls test on Plan 9 in all cases
   109  	JMP	ok
   110  #endif
   111  
   112  	// set up %gs
   113  	CALL	runtime·ldt0setup(SB)
   114  
   115  	// store through it, to make sure it works
   116  	get_tls(BX)
   117  	MOVL	$0x123, g(BX)	// write a marker through the TLS slot
   118  	MOVL	runtime·m0+m_tls(SB), AX	// and read it back directly from m0.tls
   119  	CMPL	AX, $0x123
   120  	JEQ	ok
   121  	MOVL	AX, 0	// abort
   122  ok:
   123  	// set up m and g "registers"
   124  	get_tls(BX)
   125  	LEAL	runtime·g0(SB), DX
   126  	MOVL	DX, g(BX)	// current g = g0
   127  	LEAL	runtime·m0(SB), AX
   128  
   129  	// save m->g0 = g0
   130  	MOVL	DX, m_g0(AX)
   131  	// save g0->m = m0
   132  	MOVL	AX, g_m(DX)
   133  
   134  	CALL	runtime·emptyfunc(SB)	// fault if stack check is wrong
   135  
   136  	// convention is D is always cleared
   137  	CLD
   138  
   139  	CALL	runtime·check(SB)
   140  
   141  	// saved argc, argv
   142  	MOVL	120(SP), AX
   143  	MOVL	AX, 0(SP)
   144  	MOVL	124(SP), AX
   145  	MOVL	AX, 4(SP)
   146  	CALL	runtime·args(SB)
   147  	CALL	runtime·osinit(SB)
   148  	CALL	runtime·schedinit(SB)
   149  
   150  	// create a new goroutine to start program
   151  	PUSHL	$runtime·mainPC(SB)	// entry
   152  	PUSHL	$0	// arg size
   153  	CALL	runtime·newproc(SB)
   154  	POPL	AX	// discard the two words pushed for newproc
   155  	POPL	AX
   156  
   157  	// start this M
   158  	CALL	runtime·mstart(SB)
   159  
   160  	INT $3	// trap if mstart returns
   161  	RET
   162  
   163  DATA	bad_proc_msg<>+0x00(SB)/8, $"This pro"	// text written to fd 2 by rt0_go's bad_proc path
   164  DATA	bad_proc_msg<>+0x08(SB)/8, $"gram can"
   165  DATA	bad_proc_msg<>+0x10(SB)/8, $" only be"
   166  DATA	bad_proc_msg<>+0x18(SB)/8, $" run on "
   167  DATA	bad_proc_msg<>+0x20(SB)/8, $"processo"	// spelling fixed ("processors"); length unchanged
   168  DATA	bad_proc_msg<>+0x28(SB)/8, $"rs with "
   169  DATA	bad_proc_msg<>+0x30(SB)/8, $"MMX supp"
   170  DATA	bad_proc_msg<>+0x38(SB)/4, $"ort."
   171  DATA	bad_proc_msg<>+0x3c(SB)/1, $0xa	// trailing newline
   172  GLOBL	bad_proc_msg<>(SB), RODATA, $0x3d	// 0x3c bytes of text + newline; rt0_go passes the same length
   173  
   174  DATA	runtime·mainPC+0(SB)/4,$runtime·main(SB)	// one word holding the address of runtime·main
   175  GLOBL	runtime·mainPC(SB),RODATA,$4	// rt0_go pushes $runtime·mainPC as the newproc entry
   176  
   177  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0	// executes a breakpoint trap, then returns
   178  	INT $3
   179  	RET
   180  
   181  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   182  	// Linux and MinGW start the FPU in extended double precision.
   183  	// Other operating systems use double precision.
   184  	// Change to double precision to match them,
   185  	// and to match other hardware that only has double.
   186  	PUSHL $0x27F	// new x87 control word, placed on the stack
   187  	FLDCW	0(SP)	// load it into the FPU control register
   188  	POPL AX	// drop the temporary word
   189  	RET
   190  
   191  /*
   192   *  go-routine
   193   */
   194  
   195  // void gosave(Gobuf*)
   196  // save state in Gobuf; setjmp
   197  TEXT runtime·gosave(SB), NOSPLIT, $0-4
        	// Records the caller's SP and PC and the current g in *buf; clears ret and ctxt.
   198  	MOVL	buf+0(FP), AX		// gobuf
   199  	LEAL	buf+0(FP), BX		// caller's SP
   200  	MOVL	BX, gobuf_sp(AX)
   201  	MOVL	0(SP), BX		// caller's PC
   202  	MOVL	BX, gobuf_pc(AX)
   203  	MOVL	$0, gobuf_ret(AX)
   204  	MOVL	$0, gobuf_ctxt(AX)
   205  	get_tls(CX)
   206  	MOVL	g(CX), BX		// current g
   207  	MOVL	BX, gobuf_g(AX)
   208  	RET
   209  
   210  // void gogo(Gobuf*)
   211  // restore state from Gobuf; longjmp
   212  TEXT runtime·gogo(SB), NOSPLIT, $0-4
        	// Makes buf->g the current g, switches to buf->sp and jumps to buf->pc with
        	// AX = buf->ret and DX = buf->ctxt. Does not return to its caller.
   213  	MOVL	buf+0(FP), BX		// gobuf
   214  	MOVL	gobuf_g(BX), DX
   215  	MOVL	0(DX), CX		// make sure g != nil
   216  	get_tls(CX)
   217  	MOVL	DX, g(CX)		// current g = buf->g
   218  	MOVL	gobuf_sp(BX), SP	// restore SP
   219  	MOVL	gobuf_ret(BX), AX
   220  	MOVL	gobuf_ctxt(BX), DX
   221  	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
   222  	MOVL	$0, gobuf_ret(BX)
   223  	MOVL	$0, gobuf_ctxt(BX)
   224  	MOVL	gobuf_pc(BX), BX	// load the target PC last; BX stops pointing at the gobuf here
   225  	JMP	BX
   226  
   227  // func mcall(fn func(*g))
   228  // Switch to m->g0's stack, call fn(g).
   229  // Fn must never return.  It should gogo(&g->sched)
   230  // to keep running g.
   231  TEXT runtime·mcall(SB), NOSPLIT, $0-4
   232  	MOVL	fn+0(FP), DI	// DI = fn (pointer to the func value)
   233  
   234  	get_tls(DX)
   235  	MOVL	g(DX), AX	// save state in g->sched
   236  	MOVL	0(SP), BX	// caller's PC
   237  	MOVL	BX, (g_sched+gobuf_pc)(AX)
   238  	LEAL	fn+0(FP), BX	// caller's SP
   239  	MOVL	BX, (g_sched+gobuf_sp)(AX)
   240  	MOVL	AX, (g_sched+gobuf_g)(AX)
   241  
   242  	// switch to m->g0 & its stack, call fn
   243  	MOVL	g(DX), BX
   244  	MOVL	g_m(BX), BX
   245  	MOVL	m_g0(BX), SI
   246  	CMPL	SI, AX	// if g == m->g0 call badmcall
   247  	JNE	3(PC)	// skip the next two instructions when g != g0
   248  	MOVL	$runtime·badmcall(SB), AX
   249  	JMP	AX
   250  	MOVL	SI, g(DX)	// g = m->g0
   251  	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   252  	PUSHL	AX	// argument for fn: the g that called mcall
   253  	MOVL	DI, DX	// DX = func value, the closure context for fn
   254  	MOVL	0(DI), DI	// DI = code pointer stored in the func value
   255  	CALL	DI
   256  	POPL	AX
   257  	MOVL	$runtime·badmcall2(SB), AX	// reached only if fn returned
   258  	JMP	AX
   259  	RET
   260  
   261  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   262  // of the G stack.  We need to distinguish the routine that
   263  // lives at the bottom of the G stack from the one that lives
   264  // at the top of the system stack because the one at the top of
   265  // the system stack terminates the stack walk (see topofstack()).
   266  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   267  	RET	// empty body; systemstack stores this function's address as g->sched.pc
   268  
   269  // func systemstack(fn func())
   270  TEXT runtime·systemstack(SB), NOSPLIT, $0-4
        	// Runs fn on the g0 stack. If the current g is already g0 or gsignal, fn is
        	// called in place; if it is m->curg, the routine saves its state in g->sched,
        	// switches to g0's stack, calls fn and switches back. Any other g is fatal.
   271  	MOVL	fn+0(FP), DI	// DI = fn
   272  	get_tls(CX)
   273  	MOVL	g(CX), AX	// AX = g
   274  	MOVL	g_m(AX), BX	// BX = m
   275  
   276  	MOVL	m_gsignal(BX), DX	// DX = gsignal
   277  	CMPL	AX, DX
   278  	JEQ	noswitch
   279  
   280  	MOVL	m_g0(BX), DX	// DX = g0
   281  	CMPL	AX, DX
   282  	JEQ	noswitch
   283  
   284  	MOVL	m_curg(BX), BP
   285  	CMPL	AX, BP
   286  	JEQ	switch
   287  	
   288  	// Bad: g is not gsignal, not g0, not curg. What is it?
   289  	// Hide call from linker nosplit analysis.
   290  	MOVL	$runtime·badsystemstack(SB), AX
   291  	CALL	AX
        	INT	$3	// trap rather than fall into switch: with a g that is not curg if badsystemstack returns
   292  
   293  switch:
   294  	// save our state in g->sched.  Pretend to
   295  	// be systemstack_switch if the G stack is scanned.
   296  	MOVL	$runtime·systemstack_switch(SB), (g_sched+gobuf_pc)(AX)
   297  	MOVL	SP, (g_sched+gobuf_sp)(AX)
   298  	MOVL	AX, (g_sched+gobuf_g)(AX)
   299  
   300  	// switch to g0
   301  	get_tls(CX)
   302  	MOVL	DX, g(CX)	// DX still holds g0 from the comparison above
   303  	MOVL	(g_sched+gobuf_sp)(DX), BX
   304  	// make it look like mstart called systemstack on g0, to stop traceback
   305  	SUBL	$4, BX
   306  	MOVL	$runtime·mstart(SB), DX
   307  	MOVL	DX, 0(BX)
   308  	MOVL	BX, SP
   309  
   310  	// call target function
   311  	MOVL	DI, DX	// DX = func value, the closure context for fn
   312  	MOVL	0(DI), DI	// DI = code pointer stored in the func value
   313  	CALL	DI
   314  
   315  	// switch back to g
   316  	get_tls(CX)
   317  	MOVL	g(CX), AX
   318  	MOVL	g_m(AX), BX
   319  	MOVL	m_curg(BX), AX	// AX = m->curg, the g to resume
   320  	MOVL	AX, g(CX)
   321  	MOVL	(g_sched+gobuf_sp)(AX), SP
   322  	MOVL	$0, (g_sched+gobuf_sp)(AX)	// clear the saved SP after restoring it
   323  	RET
   324  
   325  noswitch:
   326  	// already on system stack, just call directly
   327  	MOVL	DI, DX
   328  	MOVL	0(DI), DI
   329  	CALL	DI
   330  	RET
   331  
   332  /*
   333   * support for morestack
   334   */
   335  
   336  // Called during function prolog when more stack is needed.
   337  //
   338  // The traceback routines see morestack on a g0 as being
   339  // the top of a stack (for example, morestack calling newstack
   340  // calling the scheduler calling newm calling gc), so we must
   341  // record an argument size. For that purpose, it has no arguments.
   342  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   343  	// Cannot grow scheduler stack (m->g0).
   344  	get_tls(CX)
   345  	MOVL	g(CX), BX
   346  	MOVL	g_m(BX), BX	// BX = m for the rest of the routine
   347  	MOVL	m_g0(BX), SI
   348  	CMPL	g(CX), SI
   349  	JNE	2(PC)	// skip the trap unless g == m->g0
   350  	INT	$3
   351  
   352  	// Cannot grow signal stack.
   353  	MOVL	m_gsignal(BX), SI
   354  	CMPL	g(CX), SI
   355  	JNE	2(PC)	// skip the trap unless g == m->gsignal
   356  	INT	$3
   357  
   358  	// Called from f.
   359  	// Set m->morebuf to f's caller.
   360  	MOVL	4(SP), DI	// f's caller's PC
   361  	MOVL	DI, (m_morebuf+gobuf_pc)(BX)
   362  	LEAL	8(SP), CX	// f's caller's SP
   363  	MOVL	CX, (m_morebuf+gobuf_sp)(BX)
   364  	get_tls(CX)	// CX was reused as scratch just above; reload the TLS base
   365  	MOVL	g(CX), SI
   366  	MOVL	SI, (m_morebuf+gobuf_g)(BX)
   367  
   368  	// Set g->sched to context in f.
   369  	MOVL	0(SP), AX	// f's PC
   370  	MOVL	AX, (g_sched+gobuf_pc)(SI)
   371  	MOVL	SI, (g_sched+gobuf_g)(SI)
   372  	LEAL	4(SP), AX	// f's SP
   373  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   374  	MOVL	DX, (g_sched+gobuf_ctxt)(SI)	// DX as received on entry (morestack_noctxt zeroes it first)
   375  
   376  	// Call newstack on m->g0's stack.
   377  	MOVL	m_g0(BX), BP
   378  	MOVL	BP, g(CX)
   379  	MOVL	(g_sched+gobuf_sp)(BP), AX
   380  	MOVL	-4(AX), BX	// fault if CALL would, before smashing SP
   381  	MOVL	AX, SP
   382  	CALL	runtime·newstack(SB)
   383  	MOVL	$0, 0x1003	// crash if newstack returns
   384  	RET
   385  
   386  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0-0	// morestack entry that passes a zero context
   387  	MOVL	$0, DX	// morestack stores DX into g->sched.ctxt
   388  	JMP runtime·morestack(SB)
   389  
   390  TEXT runtime·stackBarrier(SB),NOSPLIT,$0
   391  	// We came here via a RET to an overwritten return PC.
   392  	// AX may be live. Other registers are available.
   393  
   394  	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   395  	get_tls(CX)
   396  	MOVL	g(CX), CX	// CX = g
   397  	MOVL	(g_stkbar+slice_array)(CX), DX	// DX = base of the g.stkbar array
   398  	MOVL	g_stkbarPos(CX), BX	// BX = index of the current barrier
   399  	IMULL	$stkbar__size, BX	// Too big for SIB.
   400  	MOVL	stkbar_savedLRVal(DX)(BX*1), BX	// BX = saved return PC of that entry
   401  	// Record that this stack barrier was hit.
   402  	ADDL	$1, g_stkbarPos(CX)
   403  	// Jump to the original return PC.
   404  	JMP	BX
   405  
   406  // reflectcall: call a function with the given argument list
   407  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   408  // we don't have variable-sized frames, so we use a small number
   409  // of constant-sized-frame functions to encode a few bits of size in the pc.
   410  // Caution: ugly multiline assembly macros in your future!
   411  
   412  #define DISPATCH(NAME,MAXSIZE)		\
   413  	CMPL	CX, $MAXSIZE;		\
   414  	JA	3(PC);			\
   415  	MOVL	$NAME(SB), AX;		\
   416  	JMP	AX
   417  // Note: can't just "JMP NAME(SB)" - bad inlining results.
        // DISPATCH jumps to NAME when CX <= MAXSIZE (unsigned compare); otherwise JA 3(PC)
        // skips the MOVL/JMP pair and execution continues after the macro.
   418  
   419  TEXT reflect·call(SB), NOSPLIT, $0-0	// reflect·call just jumps to ·reflectcall
   420  	JMP	·reflectcall(SB)
   421  
   422  TEXT ·reflectcall(SB), NOSPLIT, $0-20
        	// Jumps to the first callN whose N is >= argsize; badreflectcall if none fits.
   423  	MOVL	argsize+12(FP), CX	// CX is the value each DISPATCH compares
   424  	DISPATCH(runtime·call16, 16)
   425  	DISPATCH(runtime·call32, 32)
   426  	DISPATCH(runtime·call64, 64)
   427  	DISPATCH(runtime·call128, 128)
   428  	DISPATCH(runtime·call256, 256)
   429  	DISPATCH(runtime·call512, 512)
   430  	DISPATCH(runtime·call1024, 1024)
   431  	DISPATCH(runtime·call2048, 2048)
   432  	DISPATCH(runtime·call4096, 4096)
   433  	DISPATCH(runtime·call8192, 8192)
   434  	DISPATCH(runtime·call16384, 16384)
   435  	DISPATCH(runtime·call32768, 32768)
   436  	DISPATCH(runtime·call65536, 65536)
   437  	DISPATCH(runtime·call131072, 131072)
   438  	DISPATCH(runtime·call262144, 262144)
   439  	DISPATCH(runtime·call524288, 524288)
   440  	DISPATCH(runtime·call1048576, 1048576)
   441  	DISPATCH(runtime·call2097152, 2097152)
   442  	DISPATCH(runtime·call4194304, 4194304)
   443  	DISPATCH(runtime·call8388608, 8388608)
   444  	DISPATCH(runtime·call16777216, 16777216)
   445  	DISPATCH(runtime·call33554432, 33554432)
   446  	DISPATCH(runtime·call67108864, 67108864)
   447  	DISPATCH(runtime·call134217728, 134217728)
   448  	DISPATCH(runtime·call268435456, 268435456)
   449  	DISPATCH(runtime·call536870912, 536870912)
   450  	DISPATCH(runtime·call1073741824, 1073741824)
   451  	MOVL	$runtime·badreflectcall(SB), AX	// argsize is larger than every frame size above
   452  	JMP	AX
   453  
   454  #define CALLFN(NAME,MAXSIZE)			\
   455  TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
   456  	NO_LOCAL_POINTERS;			\
   457  	/* copy argsize bytes from argptr to the bottom of this frame */		\
   458  	MOVL	argptr+8(FP), SI;		\
   459  	MOVL	argsize+12(FP), CX;		\
   460  	MOVL	SP, DI;				\
   461  	REP;MOVSB;				\
   462  	/* call the code pointer stored in f, with DX = f as closure context */			\
   463  	MOVL	f+4(FP), DX;			\
   464  	MOVL	(DX), AX; 			\
   465  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   466  	CALL	AX;				\
   467  	/* copy results back: frame bytes [retoffset, argsize) to argptr+retoffset */		\
   468  	MOVL	argptr+8(FP), DI;		\
   469  	MOVL	argsize+12(FP), CX;		\
   470  	MOVL	retoffset+16(FP), BX;		\
   471  	MOVL	SP, SI;				\
   472  	ADDL	BX, DI;				\
   473  	ADDL	BX, SI;				\
   474  	SUBL	BX, CX;				\
   475  	REP;MOVSB;				\
   476  	/* execute write barrier updates: callwritebarrier(argtype, argptr, argsize, retoffset) */	\
   477  	MOVL	argtype+0(FP), DX;		\
   478  	MOVL	argptr+8(FP), DI;		\
   479  	MOVL	argsize+12(FP), CX;		\
   480  	MOVL	retoffset+16(FP), BX;		\
   481  	MOVL	DX, 0(SP);			\
   482  	MOVL	DI, 4(SP);			\
   483  	MOVL	CX, 8(SP);			\
   484  	MOVL	BX, 12(SP);			\
   485  	CALL	runtime·callwritebarrier(SB);	\
   486  	RET
   487  
   488  CALLFN(·call16, 16)	// one fixed-frame function per size that ·reflectcall dispatches to
   489  CALLFN(·call32, 32)
   490  CALLFN(·call64, 64)
   491  CALLFN(·call128, 128)
   492  CALLFN(·call256, 256)
   493  CALLFN(·call512, 512)
   494  CALLFN(·call1024, 1024)
   495  CALLFN(·call2048, 2048)
   496  CALLFN(·call4096, 4096)
   497  CALLFN(·call8192, 8192)
   498  CALLFN(·call16384, 16384)
   499  CALLFN(·call32768, 32768)
   500  CALLFN(·call65536, 65536)
   501  CALLFN(·call131072, 131072)
   502  CALLFN(·call262144, 262144)
   503  CALLFN(·call524288, 524288)
   504  CALLFN(·call1048576, 1048576)
   505  CALLFN(·call2097152, 2097152)
   506  CALLFN(·call4194304, 4194304)
   507  CALLFN(·call8388608, 8388608)
   508  CALLFN(·call16777216, 16777216)
   509  CALLFN(·call33554432, 33554432)
   510  CALLFN(·call67108864, 67108864)
   511  CALLFN(·call134217728, 134217728)
   512  CALLFN(·call268435456, 268435456)
   513  CALLFN(·call536870912, 536870912)
   514  CALLFN(·call1073741824, 1073741824)	// largest frame: 1<<30 bytes
   515  
   516  TEXT runtime·procyield(SB),NOSPLIT,$0-0
        	// Executes PAUSE `cycles` times.
        	// NOTE(review): the frame spec declares 0 argument bytes, yet a 4-byte
        	// argument is read at cycles+0(FP) — confirm against the Go declaration.
   517  	MOVL	cycles+0(FP), AX
   518  again:
   519  	PAUSE
   520  	SUBL	$1, AX
   521  	JNZ	again	// the count is tested only after decrementing, so cycles == 0 wraps around
   522  	RET
   523  
   524  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   525  	// Stores are already ordered on x86, so this is just a
   526  	// compile barrier.
   527  	RET	// no instructions are needed; the call itself is the barrier
   528  
   529  // void jmpdefer(fn, sp);
   530  // called from deferreturn.
   531  // 1. pop the caller
   532  // 2. sub 5 bytes from the callers return
   533  // 3. jmp to the argument
   534  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
   535  	MOVL	fv+0(FP), DX	// fn
   536  	MOVL	argp+4(FP), BX	// caller sp
   537  	LEAL	-4(BX), SP	// caller sp after CALL
   538  	SUBL	$5, (SP)	// return to CALL again
   539  	MOVL	0(DX), BX	// BX = code pointer stored in fn; DX stays as closure context
   540  	JMP	BX	// but first run the deferred function
   541  
   542  // Save state of caller into g->sched.
   543  TEXT gosave<>(SB),NOSPLIT,$0
        	// AX and BX are preserved: asmcgocall holds fn and arg in them across this call.
   544  	PUSHL	AX
   545  	PUSHL	BX
   546  	get_tls(BX)
   547  	MOVL	g(BX), BX	// BX = current g
   548  	LEAL	arg+0(FP), AX	// caller's SP at the call
   549  	MOVL	AX, (g_sched+gobuf_sp)(BX)
   550  	MOVL	-4(AX), AX	// word just below it: the return PC into the caller
   551  	MOVL	AX, (g_sched+gobuf_pc)(BX)
   552  	MOVL	$0, (g_sched+gobuf_ret)(BX)
   553  	MOVL	$0, (g_sched+gobuf_ctxt)(BX)
   554  	POPL	BX
   555  	POPL	AX
   556  	RET
   557  
   558  // func asmcgocall(fn, arg unsafe.Pointer) int32
   559  // Call fn(arg) on the scheduler stack,
   560  // aligned appropriately for the gcc ABI.
   561  // See cgocall.go for more details.
   562  TEXT ·asmcgocall(SB),NOSPLIT,$0-12
   563  	MOVL	fn+0(FP), AX
   564  	MOVL	arg+4(FP), BX
   565  
   566  	MOVL	SP, DX	// DX = SP on entry, used below to compute the stack depth
   567  
   568  	// Figure out if we need to switch to m->g0 stack.
   569  	// We get called to create new OS threads too, and those
   570  	// come in on the m->g0 stack already.
   571  	get_tls(CX)
   572  	MOVL	g(CX), BP
   573  	MOVL	g_m(BP), BP
   574  	MOVL	m_g0(BP), SI	// SI = m->g0
   575  	MOVL	g(CX), DI	// DI = current g
   576  	CMPL	SI, DI
   577  	JEQ	noswitch
   578  	CALL	gosave<>(SB)	// records this call site in g->sched
   579  	get_tls(CX)
   580  	MOVL	SI, g(CX)	// g = m->g0
   581  	MOVL	(g_sched+gobuf_sp)(SI), SP
   582  
   583  noswitch:
   584  	// Now on a scheduling stack (a pthread-created stack).
   585  	SUBL	$32, SP
   586  	ANDL	$~15, SP	// alignment, perhaps unnecessary
   587  	MOVL	DI, 8(SP)	// save g
   588  	MOVL	(g_stack+stack_hi)(DI), DI
   589  	SUBL	DX, DI	// depth = g->stack.hi - entry SP
   590  	MOVL	DI, 4(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   591  	MOVL	BX, 0(SP)	// first argument in x86-32 ABI
   592  	CALL	AX
   593  
   594  	// Restore registers, g, stack pointer.
   595  	get_tls(CX)
   596  	MOVL	8(SP), DI	// DI = saved g
   597  	MOVL	(g_stack+stack_hi)(DI), SI
   598  	SUBL	4(SP), SI	// SI = g->stack.hi - saved depth, the SP to go back to
   599  	MOVL	DI, g(CX)
   600  	MOVL	SI, SP
   601  
   602  	MOVL	AX, ret+8(FP)	// AX is whatever fn left there
   603  	RET
   604  
   605  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
   606  // Turn the fn into a Go func (by taking its address) and call
   607  // cgocallback_gofunc.
   608  TEXT runtime·cgocallback(SB),NOSPLIT,$12-12
   609  	LEAL	fn+0(FP), AX	// address of the fn argument slot, used as the func value pointer
   610  	MOVL	AX, 0(SP)
   611  	MOVL	frame+4(FP), AX
   612  	MOVL	AX, 4(SP)
   613  	MOVL	framesize+8(FP), AX
   614  	MOVL	AX, 8(SP)
   615  	MOVL	$runtime·cgocallback_gofunc(SB), AX	// call through a register rather than by name
   616  	CALL	AX
   617  	RET
   618  
   619  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
   620  // See cgocall.go for more details.
   621  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-12
   622  	NO_LOCAL_POINTERS
   623  
   624  	// If g is nil, Go did not create the current thread.
   625  	// Call needm to obtain one for temporary use.
   626  	// In this case, we're running on the thread stack, so there's
   627  	// lots of space, but the linker doesn't know. Hide the call from
   628  	// the linker analysis by using an indirect call through AX.
   629  	get_tls(CX)
   630  #ifdef GOOS_windows
   631  	MOVL	$0, BP
   632  	CMPL	CX, $0
   633  	JEQ	2(PC) // TODO
   634  #endif
   635  	MOVL	g(CX), BP
   636  	CMPL	BP, $0
   637  	JEQ	needm
   638  	MOVL	g_m(BP), BP
   639  	MOVL	BP, DX // saved copy of oldm
   640  	JMP	havem
   641  needm:
   642  	MOVL	$0, 0(SP)
   643  	MOVL	$runtime·needm(SB), AX
   644  	CALL	AX
   645  	MOVL	0(SP), DX	// DX = the word zeroed before the call; the dropm test below keys on DX == 0
   646  	get_tls(CX)
   647  	MOVL	g(CX), BP
   648  	MOVL	g_m(BP), BP	// BP = the m now attached to this thread
   649  
   650  	// Set m->sched.sp = SP, so that if a panic happens
   651  	// during the function we are about to execute, it will
   652  	// have a valid SP to run on the g0 stack.
   653  	// The next few lines (after the havem label)
   654  	// will save this SP onto the stack and then write
   655  	// the same SP back to m->sched.sp. That seems redundant,
   656  	// but if an unrecovered panic happens, unwindm will
   657  	// restore the g->sched.sp from the stack location
   658  	// and then systemstack will try to use it. If we don't set it here,
   659  	// that restored SP will be uninitialized (typically 0) and
   660  	// will not be usable.
   661  	MOVL	m_g0(BP), SI
   662  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   663  
   664  havem:
   665  	// Now there's a valid m, and we're running on its m->g0.
   666  	// Save current m->g0->sched.sp on stack and then set it to SP.
   667  	// Save current sp in m->g0->sched.sp in preparation for
   668  	// switch back to m->curg stack.
   669  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   670  	MOVL	m_g0(BP), SI
   671  	MOVL	(g_sched+gobuf_sp)(SI), AX
   672  	MOVL	AX, 0(SP)
   673  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   674  
   675  	// Switch to m->curg stack and call runtime.cgocallbackg.
   676  	// Because we are taking over the execution of m->curg
   677  	// but *not* resuming what had been running, we need to
   678  	// save that information (m->curg->sched) so we can restore it.
   679  	// We can restore m->curg->sched.sp easily, because calling
   680  	// runtime.cgocallbackg leaves SP unchanged upon return.
   681  	// To save m->curg->sched.pc, we push it onto the stack.
   682  	// This has the added benefit that it looks to the traceback
   683  	// routine like cgocallbackg is going to return to that
   684  	// PC (because the frame we allocate below has the same
   685  	// size as cgocallback_gofunc's frame declared above)
   686  	// so that the traceback will seamlessly trace back into
   687  	// the earlier calls.
   688  	//
   689  	// In the new goroutine, 0(SP) holds the saved oldm (DX) register.
   690  	// 4(SP) and 8(SP) are unused.
   691  	MOVL	m_curg(BP), SI
   692  	MOVL	SI, g(CX)	// g = m->curg
   693  	MOVL	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
   694  	MOVL	(g_sched+gobuf_pc)(SI), BP
   695  	MOVL	BP, -4(DI)	// saved curg->sched.pc sits where a return PC would be
   696  	LEAL	-(4+12)(DI), SP	// 4 bytes for that PC plus a 12-byte frame
   697  	MOVL	DX, 0(SP)
   698  	CALL	runtime·cgocallbackg(SB)
   699  	MOVL	0(SP), DX	// reload oldm
   700  
   701  	// Restore g->sched (== m->curg->sched) from saved values.
   702  	get_tls(CX)
   703  	MOVL	g(CX), SI
   704  	MOVL	12(SP), BP	// the PC stored at -4(DI) above
   705  	MOVL	BP, (g_sched+gobuf_pc)(SI)
   706  	LEAL	(12+4)(SP), DI	// the original curg->sched.sp
   707  	MOVL	DI, (g_sched+gobuf_sp)(SI)
   708  
   709  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   710  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   711  	// so we do not have to restore it.)
   712  	MOVL	g(CX), BP
   713  	MOVL	g_m(BP), BP
   714  	MOVL	m_g0(BP), SI
   715  	MOVL	SI, g(CX)
   716  	MOVL	(g_sched+gobuf_sp)(SI), SP
   717  	MOVL	0(SP), AX	// the g0 sched.sp saved at havem
   718  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   719  	
   720  	// If the m on entry was nil, we called needm above to borrow an m
   721  	// for the duration of the call. Since the call is over, return it with dropm.
   722  	CMPL	DX, $0
   723  	JNE 3(PC)	// oldm was non-nil: skip the dropm call
   724  	MOVL	$runtime·dropm(SB), AX
   725  	CALL	AX
   726  
   727  	// Done!
   728  	RET
   729  
   730  // void setg(G*); set g. for use by needm.
   731  TEXT runtime·setg(SB), NOSPLIT, $0-4
   732  	MOVL	gg+0(FP), BX
   733  #ifdef GOOS_windows
   734  	CMPL	BX, $0
   735  	JNE	settls
   736  	MOVL	$0, 0x14(FS)	// gg == nil: clear the word at 0x14(FS) and return without touching g
   737  	RET
   738  settls:
   739  	MOVL	g_m(BX), AX
   740  	LEAL	m_tls(AX), AX
   741  	MOVL	AX, 0x14(FS)	// 0x14(FS) = &gg->m->tls
   742  #endif
   743  	get_tls(CX)
   744  	MOVL	BX, g(CX)	// current g = gg
   745  	RET
   746  
   747  // void setg_gcc(G*); set g. for use by gcc
   748  TEXT setg_gcc<>(SB), NOSPLIT, $0
        	// rt0_go hands this function's address to _cgo_init. Only AX and DX are used.
   749  	get_tls(AX)
   750  	MOVL	gg+0(FP), DX
   751  	MOVL	DX, g(AX)	// current g = gg
   752  	RET
   753  
   754  // check that SP is in range [g->stack.lo, g->stack.hi)
   755  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
        	// Traps with INT $3 when SP is out of range. Both tests below are strict
        	// unsigned comparisons, so SP == stack.lo also traps.
   756  	get_tls(CX)
   757  	MOVL	g(CX), AX
   758  	CMPL	(g_stack+stack_hi)(AX), SP
   759  	JHI	2(PC)	// stack.hi > SP: skip the trap
   760  	INT	$3
   761  	CMPL	SP, (g_stack+stack_lo)(AX)
   762  	JHI	2(PC)	// SP > stack.lo: skip the trap
   763  	INT	$3
   764  	RET
   765  
   766  TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8
        	// Returns the word stored just below argp. If that word equals stackBarrierPC,
        	// the value from nextBarrierPC is returned instead.
   767  	MOVL	argp+0(FP),AX		// addr of first arg
   768  	MOVL	-4(AX),AX		// get calling pc
   769  	CMPL	AX, runtime·stackBarrierPC(SB)
   770  	JNE	nobar
   771  	// Get original return PC.
   772  	CALL	runtime·nextBarrierPC(SB)
   773  	MOVL	0(SP), AX	// nextBarrierPC's result, read from this frame's 4-byte slot
   774  nobar:
   775  	MOVL	AX, ret+4(FP)
   776  	RET
   777  
   778  TEXT runtime·setcallerpc(SB),NOSPLIT,$4-8
        	// Overwrites the word just below argp with pc. If that word currently equals
        	// stackBarrierPC, pc is passed to setNextBarrierPC instead.
   779  	MOVL	argp+0(FP),AX		// addr of first arg
   780  	MOVL	pc+4(FP), BX
   781  	MOVL	-4(AX), DX	// current value of the return-PC slot
   782  	CMPL	DX, runtime·stackBarrierPC(SB)
   783  	JEQ	setbar
   784  	MOVL	BX, -4(AX)		// set calling pc
   785  	RET
   786  setbar:
   787  	// Set the stack barrier return PC.
   788  	MOVL	BX, 0(SP)	// argument for setNextBarrierPC
   789  	CALL	runtime·setNextBarrierPC(SB)
   790  	RET
   791  
   792  TEXT runtime·getcallersp(SB), NOSPLIT, $0-8
   793  	MOVL	argp+0(FP), CX	// CX = argp exactly as the caller passed it
   794  	MOVL	CX, ret+4(FP)	// hand the same value back as the result
   795  	RET
   796  
   797  // func cputicks() int64
   798  TEXT runtime·cputicks(SB),NOSPLIT,$0-8
        	// Returns the RDTSC counter, preceded by LFENCE or MFENCE when the saved
        	// cpuid_edx has bit 0x4000000 set (the choice follows lfenceBeforeRdtsc).
   799  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // no sse2, no mfence
   800  	JEQ	done
   801  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1	// set by rt0_go for "GenuineIntel"
   802  	JNE	mfence
   803  	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
   804  	JMP	done
   805  mfence:
   806  	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
   807  done:
   808  	RDTSC
   809  	MOVL	AX, ret_lo+0(FP)	// low 32 bits of the result
   810  	MOVL	DX, ret_hi+4(FP)	// high 32 bits of the result
   811  	RET
   812  
   813  TEXT runtime·ldt0setup(SB),NOSPLIT,$16-0
   814  	// Ask setldt for an LDT entry that points at m0.tls. Entry 7 is requested:
   815  	// 1 would be fine on Linux, but on OS X 7 is as low as we can go. The
   816  	// number is only a hint; setldt sets up GS with the entry it actually used.
   817  	LEAL	runtime·m0+m_tls(SB), AX
   818  	MOVL	$32, 8(SP)	// third argument: sizeof(tls array)
   819  	MOVL	AX, 4(SP)	// second argument: address of m0.tls
   820  	MOVL	$7, 0(SP)	// first argument: requested entry number
   821  	CALL	runtime·setldt(SB)
   822  	RET
   823  
   824  TEXT runtime·emptyfunc(SB),0,$0-0	// flags are 0, not NOSPLIT; rt0_go calls this to "fault if stack check is wrong"
   825  	RET
   826  
   827  TEXT runtime·abort(SB),NOSPLIT,$0-0
   828  	INT $0x3	// breakpoint trap
        loop:
        	JMP	loop	// if execution resumes after the trap, spin here instead of running into the next function
   829  
   830  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   831  // forwards to memhash(p, h, size), where size is the word
   832  // stored at offset 4 of the closure that DX points to.
   833  TEXT runtime·memhash_varlen(SB),NOSPLIT,$16-12
   834  	GO_ARGS
   835  	NO_LOCAL_POINTERS
   836  	MOVL	4(DX), CX	// size from the closure
   837  	MOVL	CX, 8(SP)
   838  	MOVL	h+4(FP), CX	// seed
   839  	MOVL	CX, 4(SP)
   840  	MOVL	p+0(FP), CX	// data pointer
   841  	MOVL	CX, 0(SP)
   842  	CALL	runtime·memhash(SB)
   843  	MOVL	12(SP), CX	// memhash's result follows its three arguments
   844  	MOVL	CX, ret+8(FP)
   845  	RET
   846  
   847  // hash function using AES hardware instructions
   848  TEXT runtime·aeshash(SB),NOSPLIT,$0-16
        	// Loads the registers aeshashbody expects (AX data, BX length, DX result address) and jumps to it.
   849  	MOVL	p+0(FP), AX	// ptr to data
   850  	MOVL	s+8(FP), BX	// size
   851  	LEAL	ret+12(FP), DX	// where aeshashbody stores the hash
   852  	JMP	runtime·aeshashbody(SB)
   853  
   854  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
        	// Same as aeshash, but p points at a string header: data at (p), length at 4(p).
   855  	MOVL	p+0(FP), AX	// ptr to string object
   856  	MOVL	4(AX), BX	// length of string
   857  	MOVL	(AX), AX	// string data
   858  	LEAL	ret+8(FP), DX	// where aeshashbody stores the hash
   859  	JMP	runtime·aeshashbody(SB)
   860  
   861  // AX: data
   862  // BX: length
   863  // DX: address to put return value
   864  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
        	// Reached by JMP from aeshash and aeshashstr; h+4(FP) below reads the second
        	// argument word of whichever of them jumped here.
   865  	MOVL	h+4(FP), X0	            // 32 bits of per-table hash seed
   866  	PINSRW	$4, BX, X0	            // 16 bits of length
   867  	PSHUFHW	$0, X0, X0	            // replace size with its low 2 bytes repeated 4 times
   868  	MOVO	X0, X1                      // save unscrambled seed
   869  	PXOR	runtime·aeskeysched(SB), X0 // xor in per-process seed
   870  	AESENC	X0, X0                      // scramble seed
   871  
        	// Pick a code path by length: 0-15, exactly 16, 17-32, 33-64, 65 and up.
   872  	CMPL	BX, $16
   873  	JB	aes0to15
   874  	JE	aes16
   875  	CMPL	BX, $32
   876  	JBE	aes17to32
   877  	CMPL	BX, $64
   878  	JBE	aes33to64
   879  	JMP	aes65plus
   880  	
   881  aes0to15:
   882  	TESTL	BX, BX
   883  	JE	aes0
   884  
   885  	ADDL	$16, AX
   886  	TESTW	$0xff0, AX	// zero when bits 4-11 of the original address were all ones
   887  	JE	endofpage
   888  
   889  	// 16 bytes loaded at this address won't cross
   890  	// a page boundary, so we can load it directly.
   891  	MOVOU	-16(AX), X1
   892  	ADDL	BX, BX	// BX = 2*length, so (BX*8) steps through the table 16 bytes per length
   893  	PAND	masks<>(SB)(BX*8), X1
   894  
   895  final1:	
   896  	AESENC	X0, X1  // scramble input, xor in seed
   897  	AESENC	X1, X1  // scramble combo 2 times
   898  	AESENC	X1, X1
   899  	MOVL	X1, (DX)	// store the low 32 bits as the result
   900  	RET
   901  
   902  endofpage:
   903  	// address ends in 1111xxxx.  Might be up against
   904  	// a page boundary, so load ending at last byte.
   905  	// Then shift bytes down using pshufb.
   906  	MOVOU	-32(AX)(BX*1), X1
   907  	ADDL	BX, BX	// same 16-bytes-per-length indexing as above
   908  	PSHUFB	shifts<>(SB)(BX*8), X1
   909  	JMP	final1
   910  
   911  aes0:
   912  	// Return scrambled input seed
   913  	AESENC	X0, X0
   914  	MOVL	X0, (DX)
   915  	RET
   916  
   917  aes16:
   918  	MOVOU	(AX), X1
   919  	JMP	final1
   920  
   921  aes17to32:
   922  	// make second starting seed
   923  	PXOR	runtime·aeskeysched+16(SB), X1
   924  	AESENC	X1, X1
   925  	
   926  	// load data to be hashed
   927  	MOVOU	(AX), X2	// first 16 bytes
   928  	MOVOU	-16(AX)(BX*1), X3	// last 16 bytes (overlaps the first load when length < 32)
   929  
   930  	// scramble 3 times
   931  	AESENC	X0, X2
   932  	AESENC	X1, X3
   933  	AESENC	X2, X2
   934  	AESENC	X3, X3
   935  	AESENC	X2, X2
   936  	AESENC	X3, X3
   937  
   938  	// combine results
   939  	PXOR	X3, X2
   940  	MOVL	X2, (DX)
   941  	RET
   942  
   943  aes33to64:
   944  	// make 3 more starting seeds
   945  	MOVO	X1, X2
   946  	MOVO	X1, X3
   947  	PXOR	runtime·aeskeysched+16(SB), X1
   948  	PXOR	runtime·aeskeysched+32(SB), X2
   949  	PXOR	runtime·aeskeysched+48(SB), X3
   950  	AESENC	X1, X1
   951  	AESENC	X2, X2
   952  	AESENC	X3, X3
   953  	
        	// load the first 32 and the last 32 bytes (they overlap when length < 64)
   954  	MOVOU	(AX), X4
   955  	MOVOU	16(AX), X5
   956  	MOVOU	-32(AX)(BX*1), X6
   957  	MOVOU	-16(AX)(BX*1), X7
   958  	
   959  	AESENC	X0, X4
   960  	AESENC	X1, X5
   961  	AESENC	X2, X6
   962  	AESENC	X3, X7
   963  	
   964  	AESENC	X4, X4
   965  	AESENC	X5, X5
   966  	AESENC	X6, X6
   967  	AESENC	X7, X7
   968  	
   969  	AESENC	X4, X4
   970  	AESENC	X5, X5
   971  	AESENC	X6, X6
   972  	AESENC	X7, X7
   973  
        	// fold the four lanes into X4
   974  	PXOR	X6, X4
   975  	PXOR	X7, X5
   976  	PXOR	X5, X4
   977  	MOVL	X4, (DX)
   978  	RET
   979  
   980  aes65plus:
   981  	// make 3 more starting seeds
   982  	MOVO	X1, X2
   983  	MOVO	X1, X3
   984  	PXOR	runtime·aeskeysched+16(SB), X1
   985  	PXOR	runtime·aeskeysched+32(SB), X2
   986  	PXOR	runtime·aeskeysched+48(SB), X3
   987  	AESENC	X1, X1
   988  	AESENC	X2, X2
   989  	AESENC	X3, X3
   990  	
   991  	// start with last (possibly overlapping) block
   992  	MOVOU	-64(AX)(BX*1), X4
   993  	MOVOU	-48(AX)(BX*1), X5
   994  	MOVOU	-32(AX)(BX*1), X6
   995  	MOVOU	-16(AX)(BX*1), X7
   996  
   997  	// scramble state once
   998  	AESENC	X0, X4
   999  	AESENC	X1, X5
  1000  	AESENC	X2, X6
  1001  	AESENC	X3, X7
  1002  
  1003  	// compute number of remaining 64-byte blocks
  1004  	DECL	BX
  1005  	SHRL	$6, BX	// BX = (length-1)/64, at least 1 on this path
  1006  	
  1007  aesloop:
  1008  	// scramble state, xor in a block
  1009  	MOVOU	(AX), X0
  1010  	MOVOU	16(AX), X1
  1011  	MOVOU	32(AX), X2
  1012  	MOVOU	48(AX), X3
  1013  	AESENC	X0, X4
  1014  	AESENC	X1, X5
  1015  	AESENC	X2, X6
  1016  	AESENC	X3, X7
  1017  
  1018  	// scramble state
  1019  	AESENC	X4, X4
  1020  	AESENC	X5, X5
  1021  	AESENC	X6, X6
  1022  	AESENC	X7, X7
  1023  
  1024  	ADDL	$64, AX	// advance to the next 64-byte block
  1025  	DECL	BX
  1026  	JNE	aesloop
  1027  
  1028  	// 2 more scrambles to finish
  1029  	AESENC	X4, X4
  1030  	AESENC	X5, X5
  1031  	AESENC	X6, X6
  1032  	AESENC	X7, X7
  1033  	
  1034  	AESENC	X4, X4
  1035  	AESENC	X5, X5
  1036  	AESENC	X6, X6
  1037  	AESENC	X7, X7
  1038  
        	// fold the four lanes into X4
  1039  	PXOR	X6, X4
  1040  	PXOR	X7, X5
  1041  	PXOR	X5, X4
  1042  	MOVL	X4, (DX)
  1043  	RET
  1044  
  1045  TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
        	// Hashes the 4-byte value at p together with the seed h and stores the
        	// low 32 bits of the result in ret. Frame: p+0, h+4, ret+8.
        	// Each AESENC uses 16 bytes of runtime·aeskeysched as its round key,
        	// so the result depends on the contents of that table.
  1046  	MOVL	p+0(FP), AX	// ptr to data
  1047  	MOVL	h+4(FP), X0	// seed -> dword 0 of X0
  1048  	PINSRD	$1, (AX), X0	// data -> dword 1 of X0
  1049  	AESENC	runtime·aeskeysched+0(SB), X0	// round 1
  1050  	AESENC	runtime·aeskeysched+16(SB), X0	// round 2
  1051  	AESENC	runtime·aeskeysched+32(SB), X0	// round 3
  1052  	MOVL	X0, ret+8(FP)	// return the low 32 bits of the state
  1053  	RET
  1054  
  1055  TEXT runtime·aeshash64(SB),NOSPLIT,$0-12
        	// Hashes the 8-byte value at p together with the seed h and stores the
        	// low 32 bits of the result in ret. Frame: p+0, h+4, ret+8.
        	// Each AESENC uses 16 bytes of runtime·aeskeysched as its round key,
        	// so the result depends on the contents of that table.
  1056  	MOVL	p+0(FP), AX	// ptr to data
  1057  	MOVQ	(AX), X0	// data -> low 8 bytes of X0
  1058  	PINSRD	$2, h+4(FP), X0	// seed -> dword 2 of X0
  1059  	AESENC	runtime·aeskeysched+0(SB), X0	// round 1
  1060  	AESENC	runtime·aeskeysched+16(SB), X0	// round 2
  1061  	AESENC	runtime·aeskeysched+32(SB), X0	// round 3
  1062  	MOVL	X0, ret+8(FP)	// return the low 32 bits of the state
  1063  	RET
  1064  
  1065  // simple mask to get rid of data in the high part of the register.
        // Layout: sixteen 16-byte entries, entry n at byte offset n*16. Entry n
        // has 0xff in its low n bytes and 0x00 in the remaining 16-n bytes, so
        // ANDing a 16-byte value with entry n keeps only its low n bytes.
        // checkASM verifies that this table is 16-byte aligned.
  1066  DATA masks<>+0x00(SB)/4, $0x00000000	// entry 0: keep nothing
  1067  DATA masks<>+0x04(SB)/4, $0x00000000
  1068  DATA masks<>+0x08(SB)/4, $0x00000000
  1069  DATA masks<>+0x0c(SB)/4, $0x00000000
  1070  	
  1071  DATA masks<>+0x10(SB)/4, $0x000000ff	// entry 1
  1072  DATA masks<>+0x14(SB)/4, $0x00000000
  1073  DATA masks<>+0x18(SB)/4, $0x00000000
  1074  DATA masks<>+0x1c(SB)/4, $0x00000000
  1075  	
  1076  DATA masks<>+0x20(SB)/4, $0x0000ffff	// entry 2
  1077  DATA masks<>+0x24(SB)/4, $0x00000000
  1078  DATA masks<>+0x28(SB)/4, $0x00000000
  1079  DATA masks<>+0x2c(SB)/4, $0x00000000
  1080  	
  1081  DATA masks<>+0x30(SB)/4, $0x00ffffff	// entry 3
  1082  DATA masks<>+0x34(SB)/4, $0x00000000
  1083  DATA masks<>+0x38(SB)/4, $0x00000000
  1084  DATA masks<>+0x3c(SB)/4, $0x00000000
  1085  	
  1086  DATA masks<>+0x40(SB)/4, $0xffffffff	// entry 4
  1087  DATA masks<>+0x44(SB)/4, $0x00000000
  1088  DATA masks<>+0x48(SB)/4, $0x00000000
  1089  DATA masks<>+0x4c(SB)/4, $0x00000000
  1090  	
  1091  DATA masks<>+0x50(SB)/4, $0xffffffff	// entry 5
  1092  DATA masks<>+0x54(SB)/4, $0x000000ff
  1093  DATA masks<>+0x58(SB)/4, $0x00000000
  1094  DATA masks<>+0x5c(SB)/4, $0x00000000
  1095  	
  1096  DATA masks<>+0x60(SB)/4, $0xffffffff	// entry 6
  1097  DATA masks<>+0x64(SB)/4, $0x0000ffff
  1098  DATA masks<>+0x68(SB)/4, $0x00000000
  1099  DATA masks<>+0x6c(SB)/4, $0x00000000
  1100  	
  1101  DATA masks<>+0x70(SB)/4, $0xffffffff	// entry 7
  1102  DATA masks<>+0x74(SB)/4, $0x00ffffff
  1103  DATA masks<>+0x78(SB)/4, $0x00000000
  1104  DATA masks<>+0x7c(SB)/4, $0x00000000
  1105  	
  1106  DATA masks<>+0x80(SB)/4, $0xffffffff	// entry 8
  1107  DATA masks<>+0x84(SB)/4, $0xffffffff
  1108  DATA masks<>+0x88(SB)/4, $0x00000000
  1109  DATA masks<>+0x8c(SB)/4, $0x00000000
  1110  	
  1111  DATA masks<>+0x90(SB)/4, $0xffffffff	// entry 9
  1112  DATA masks<>+0x94(SB)/4, $0xffffffff
  1113  DATA masks<>+0x98(SB)/4, $0x000000ff
  1114  DATA masks<>+0x9c(SB)/4, $0x00000000
  1115  	
  1116  DATA masks<>+0xa0(SB)/4, $0xffffffff	// entry 10
  1117  DATA masks<>+0xa4(SB)/4, $0xffffffff
  1118  DATA masks<>+0xa8(SB)/4, $0x0000ffff
  1119  DATA masks<>+0xac(SB)/4, $0x00000000
  1120  	
  1121  DATA masks<>+0xb0(SB)/4, $0xffffffff	// entry 11
  1122  DATA masks<>+0xb4(SB)/4, $0xffffffff
  1123  DATA masks<>+0xb8(SB)/4, $0x00ffffff
  1124  DATA masks<>+0xbc(SB)/4, $0x00000000
  1125  	
  1126  DATA masks<>+0xc0(SB)/4, $0xffffffff	// entry 12
  1127  DATA masks<>+0xc4(SB)/4, $0xffffffff
  1128  DATA masks<>+0xc8(SB)/4, $0xffffffff
  1129  DATA masks<>+0xcc(SB)/4, $0x00000000
  1130  	
  1131  DATA masks<>+0xd0(SB)/4, $0xffffffff	// entry 13
  1132  DATA masks<>+0xd4(SB)/4, $0xffffffff
  1133  DATA masks<>+0xd8(SB)/4, $0xffffffff
  1134  DATA masks<>+0xdc(SB)/4, $0x000000ff
  1135  	
  1136  DATA masks<>+0xe0(SB)/4, $0xffffffff	// entry 14
  1137  DATA masks<>+0xe4(SB)/4, $0xffffffff
  1138  DATA masks<>+0xe8(SB)/4, $0xffffffff
  1139  DATA masks<>+0xec(SB)/4, $0x0000ffff
  1140  	
  1141  DATA masks<>+0xf0(SB)/4, $0xffffffff	// entry 15
  1142  DATA masks<>+0xf4(SB)/4, $0xffffffff
  1143  DATA masks<>+0xf8(SB)/4, $0xffffffff
  1144  DATA masks<>+0xfc(SB)/4, $0x00ffffff
  1145  
  1146  GLOBL masks<>(SB),RODATA,$256	// 16 entries * 16 bytes, read-only
  1147  
  1148  // these are arguments to pshufb.  They move data down from
  1149  // the high bytes of the register to the low bytes of the register.
  1150  // index is how many bytes to move.
        // Layout: sixteen 16-byte entries, entry n at byte offset n*16. For
        // n = 1..15 the low n control bytes are 16-n, ..., 15 (they select the
        // top n source bytes, in order) and the remaining control bytes are
        // 0xff. PSHUFB writes zero for a control byte whose high bit is set, so
        // the upper 16-n bytes of the result are cleared.
        // Entry 0 is all zeros and does not follow that pattern; presumably it
        // is never used as a shuffle control -- TODO confirm against the callers.
        // checkASM verifies that this table is 16-byte aligned.
  1151  DATA shifts<>+0x00(SB)/4, $0x00000000	// entry 0 (see note above)
  1152  DATA shifts<>+0x04(SB)/4, $0x00000000
  1153  DATA shifts<>+0x08(SB)/4, $0x00000000
  1154  DATA shifts<>+0x0c(SB)/4, $0x00000000
  1155  	
  1156  DATA shifts<>+0x10(SB)/4, $0xffffff0f	// entry 1: byte 15 -> byte 0
  1157  DATA shifts<>+0x14(SB)/4, $0xffffffff
  1158  DATA shifts<>+0x18(SB)/4, $0xffffffff
  1159  DATA shifts<>+0x1c(SB)/4, $0xffffffff
  1160  	
  1161  DATA shifts<>+0x20(SB)/4, $0xffff0f0e	// entry 2
  1162  DATA shifts<>+0x24(SB)/4, $0xffffffff
  1163  DATA shifts<>+0x28(SB)/4, $0xffffffff
  1164  DATA shifts<>+0x2c(SB)/4, $0xffffffff
  1165  	
  1166  DATA shifts<>+0x30(SB)/4, $0xff0f0e0d	// entry 3
  1167  DATA shifts<>+0x34(SB)/4, $0xffffffff
  1168  DATA shifts<>+0x38(SB)/4, $0xffffffff
  1169  DATA shifts<>+0x3c(SB)/4, $0xffffffff
  1170  	
  1171  DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c	// entry 4
  1172  DATA shifts<>+0x44(SB)/4, $0xffffffff
  1173  DATA shifts<>+0x48(SB)/4, $0xffffffff
  1174  DATA shifts<>+0x4c(SB)/4, $0xffffffff
  1175  	
  1176  DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b	// entry 5
  1177  DATA shifts<>+0x54(SB)/4, $0xffffff0f
  1178  DATA shifts<>+0x58(SB)/4, $0xffffffff
  1179  DATA shifts<>+0x5c(SB)/4, $0xffffffff
  1180  	
  1181  DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a	// entry 6
  1182  DATA shifts<>+0x64(SB)/4, $0xffff0f0e
  1183  DATA shifts<>+0x68(SB)/4, $0xffffffff
  1184  DATA shifts<>+0x6c(SB)/4, $0xffffffff
  1185  	
  1186  DATA shifts<>+0x70(SB)/4, $0x0c0b0a09	// entry 7
  1187  DATA shifts<>+0x74(SB)/4, $0xff0f0e0d
  1188  DATA shifts<>+0x78(SB)/4, $0xffffffff
  1189  DATA shifts<>+0x7c(SB)/4, $0xffffffff
  1190  	
  1191  DATA shifts<>+0x80(SB)/4, $0x0b0a0908	// entry 8
  1192  DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c
  1193  DATA shifts<>+0x88(SB)/4, $0xffffffff
  1194  DATA shifts<>+0x8c(SB)/4, $0xffffffff
  1195  	
  1196  DATA shifts<>+0x90(SB)/4, $0x0a090807	// entry 9
  1197  DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b
  1198  DATA shifts<>+0x98(SB)/4, $0xffffff0f
  1199  DATA shifts<>+0x9c(SB)/4, $0xffffffff
  1200  	
  1201  DATA shifts<>+0xa0(SB)/4, $0x09080706	// entry 10
  1202  DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a
  1203  DATA shifts<>+0xa8(SB)/4, $0xffff0f0e
  1204  DATA shifts<>+0xac(SB)/4, $0xffffffff
  1205  	
  1206  DATA shifts<>+0xb0(SB)/4, $0x08070605	// entry 11
  1207  DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09
  1208  DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d
  1209  DATA shifts<>+0xbc(SB)/4, $0xffffffff
  1210  	
  1211  DATA shifts<>+0xc0(SB)/4, $0x07060504	// entry 12
  1212  DATA shifts<>+0xc4(SB)/4, $0x0b0a0908
  1213  DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c
  1214  DATA shifts<>+0xcc(SB)/4, $0xffffffff
  1215  	
  1216  DATA shifts<>+0xd0(SB)/4, $0x06050403	// entry 13
  1217  DATA shifts<>+0xd4(SB)/4, $0x0a090807
  1218  DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b
  1219  DATA shifts<>+0xdc(SB)/4, $0xffffff0f
  1220  	
  1221  DATA shifts<>+0xe0(SB)/4, $0x05040302	// entry 14
  1222  DATA shifts<>+0xe4(SB)/4, $0x09080706
  1223  DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a
  1224  DATA shifts<>+0xec(SB)/4, $0xffff0f0e
  1225  	
  1226  DATA shifts<>+0xf0(SB)/4, $0x04030201	// entry 15
  1227  DATA shifts<>+0xf4(SB)/4, $0x08070605
  1228  DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09
  1229  DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d
  1230  
  1231  GLOBL shifts<>(SB),RODATA,$256	// 16 entries * 16 bytes, read-only
  1232  
  1233  TEXT ·checkASM(SB),NOSPLIT,$0-1
        	// Stores 1 in ret when both lookup tables are 16-byte aligned and 0
        	// otherwise. Frame: ret+0 (one byte).
  1234  	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
  1235  	MOVL	$masks<>(SB), AX
  1236  	MOVL	$shifts<>(SB), BX
  1237  	ORL	BX, AX	// a low bit set in either address survives the OR
  1238  	TESTL	$15, AX	// ZF=1 iff the low 4 bits of both addresses are zero
  1239  	SETEQ	ret+0(FP)
  1240  	RET
  1241  
  1242  TEXT runtime·memeq(SB),NOSPLIT,$0-13
        	// Entry stub for memeqbody: load the result address, the byte count
        	// and the two buffer pointers into AX, BX, DI and SI, then tail-jump.
        	// memeqbody writes the boolean through AX and returns to our caller.
        	// Frame: a+0, b+4, size+8, ret+12.
  1243  	LEAL	ret+12(FP), AX	// AX = address of the result byte
  1244  	MOVL	size+8(FP), BX	// BX = number of bytes to compare
  1245  	MOVL	b+4(FP), DI	// DI = second buffer
  1246  	MOVL	a+0(FP), SI	// SI = first buffer
  1247  	JMP	runtime·memeqbody(SB)
  1248  
  1249  // memequal_varlen(a, b unsafe.Pointer) bool
        // Reports whether the memory at a and b is equal. The length is not a
        // stack argument: it is read from offset 4 of the closure addressed by DX.
        // Identical pointers are reported equal without reading memory.
  1250  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
  1251  	MOVL    a+0(FP), SI
  1252  	MOVL    b+4(FP), DI
  1253  	CMPL    SI, DI
  1254  	JEQ     eq	// same address: trivially equal
  1255  	MOVL    4(DX), BX    // compiler stores size at offset 4 in the closure
  1256  	LEAL	ret+8(FP), AX	// memeqbody writes the result byte through AX
  1257  	JMP	runtime·memeqbody(SB)	// tail call; memeqbody returns to our caller
  1258  eq:
  1259  	MOVB    $1, ret+8(FP)
  1260  	RET
  1261  
  1262  // eqstring tests whether two strings are equal.
  1263  // The compiler guarantees that strings passed
  1264  // to eqstring have equal length.
  1265  // See runtime_test.go:eqstring_generic for
  1266  // equivalent Go code.
        // Only the first string's length is loaded, relying on the equal-length
        // guarantee above. The boolean result is stored in v+16.
  1267  TEXT runtime·eqstring(SB),NOSPLIT,$0-17
  1268  	MOVL	s1str+0(FP), SI
  1269  	MOVL	s2str+8(FP), DI
  1270  	CMPL	SI, DI
  1271  	JEQ	same	// same data pointer (and equal length): equal
  1272  	MOVL	s1len+4(FP), BX
  1273  	LEAL	v+16(FP), AX	// memeqbody writes the result byte through AX
  1274  	JMP	runtime·memeqbody(SB)	// tail call; memeqbody returns to our caller
  1275  same:
  1276  	MOVB	$1, v+16(FP)
  1277  	RET
  1278  
  1279  TEXT bytes·Equal(SB),NOSPLIT,$0-25
        	// Stores 1 in ret when the two byte slices have the same length and the
        	// same contents, 0 otherwise.
        	// Frame: a+0, a_len+4, b+12, b_len+16, ret+24.
  1280  	MOVL	a_len+4(FP), BX
  1281  	MOVL	b_len+16(FP), CX
  1282  	CMPL	BX, CX
  1283  	JNE	eqret	// different lengths: not equal, contents are not examined
  1284  	MOVL	a+0(FP), SI
  1285  	MOVL	b+12(FP), DI
  1286  	LEAL	ret+24(FP), AX	// memeqbody writes the result byte through AX
  1287  	JMP	runtime·memeqbody(SB)	// tail call; memeqbody returns to our caller
  1288  eqret:
  1289  	MOVB	$0, ret+24(FP)
  1290  	RET
  1291  
  1292  // a in SI
  1293  // b in DI
  1294  // count in BX
  1295  // address of result byte in AX
        // Shared comparison body: memeq, memequal_varlen, eqstring and
        // bytes·Equal load the registers above and tail-jump here. Stores 1
        // through AX when the count bytes at a and b are identical and 0
        // otherwise, then returns directly to the stub's caller. Depending on
        // the path taken it overwrites SI, DI, BX, CX, DX, X0-X7 and the flags.
  1296  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1297  	CMPL	BX, $4
  1298  	JB	small	// fewer than 4 bytes: handled without a loop
  1299  
  1300  	// 64 bytes at a time using xmm registers
  1301  hugeloop:
  1302  	CMPL	BX, $64
  1303  	JB	bigloop
  1304  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
  1305  	JE	bigloop	// no SSE2: fall back to the 4-byte loop
  1306  	MOVOU	(SI), X0
  1307  	MOVOU	(DI), X1
  1308  	MOVOU	16(SI), X2
  1309  	MOVOU	16(DI), X3
  1310  	MOVOU	32(SI), X4
  1311  	MOVOU	32(DI), X5
  1312  	MOVOU	48(SI), X6
  1313  	MOVOU	48(DI), X7
  1314  	PCMPEQB	X1, X0	// each result byte is 0xff where a and b match, else 0
  1315  	PCMPEQB	X3, X2
  1316  	PCMPEQB	X5, X4
  1317  	PCMPEQB	X7, X6
  1318  	PAND	X2, X0	// fold the four 16-byte results into X0
  1319  	PAND	X6, X4
  1320  	PAND	X4, X0
  1321  	PMOVMSKB X0, DX	// DX = one bit per byte; 0xffff means all 64 bytes matched
  1322  	ADDL	$64, SI
  1323  	ADDL	$64, DI
  1324  	SUBL	$64, BX
  1325  	CMPL	DX, $0xffff
  1326  	JEQ	hugeloop
  1327  	MOVB	$0, (AX)	// mismatch somewhere in this 64-byte chunk
  1328  	RET
  1329  
  1330  	// 4 bytes at a time using 32-bit register
  1331  bigloop:
  1332  	CMPL	BX, $4
  1333  	JBE	leftover	// loop only while more than 4 bytes remain
  1334  	MOVL	(SI), CX
  1335  	MOVL	(DI), DX
  1336  	ADDL	$4, SI
  1337  	ADDL	$4, DI
  1338  	SUBL	$4, BX
  1339  	CMPL	CX, DX
  1340  	JEQ	bigloop
  1341  	MOVB	$0, (AX)
  1342  	RET
  1343  
  1344  	// remaining 0-4 bytes
  1345  leftover:
        	// Compare the 4 bytes that end at the last byte of each buffer. This
        	// path is only reached when the original count was at least 4, so the
        	// load stays inside the buffer; it may re-check bytes already compared.
  1346  	MOVL	-4(SI)(BX*1), CX
  1347  	MOVL	-4(DI)(BX*1), DX
  1348  	CMPL	CX, DX
  1349  	SETEQ	(AX)
  1350  	RET
  1351  
  1352  small:
        	// Here BX is 0..3.
  1353  	CMPL	BX, $0
  1354  	JEQ	equal	// ZF is set by the CMPL above, so SETEQ at equal stores 1
  1355  
  1356  	LEAL	0(BX*8), CX
  1357  	NEGL	CX	// CX = -8*BX; shifts use the low 5 bits of CL, i.e. 32-8*BX
  1358  
  1359  	MOVL	SI, DX
  1360  	CMPB	DX, $0xfc	// tests only the low byte of the address
  1361  	JA	si_high
  1362  
  1363  	// load at SI won't cross a page boundary.
        	// (bytes past the first BX are garbage here; the final SHLL drops them)
  1364  	MOVL	(SI), SI
  1365  	JMP	si_finish
  1366  si_high:
  1367  	// address ends in 111111xx.  Load up to bytes we want, move to correct position.
  1368  	MOVL	-4(SI)(BX*1), SI
  1369  	SHRL	CX, SI	// the wanted BX bytes move to the low end of SI
  1370  si_finish:
  1371  
  1372  	// same for DI.
  1373  	MOVL	DI, DX
  1374  	CMPB	DX, $0xfc
  1375  	JA	di_high
  1376  	MOVL	(DI), DI
  1377  	JMP	di_finish
  1378  di_high:
  1379  	MOVL	-4(DI)(BX*1), DI
  1380  	SHRL	CX, DI
  1381  di_finish:
  1382  
  1383  	SUBL	SI, DI	// low 8*BX bits of the difference are zero iff the wanted bytes match
  1384  	SHLL	CX, DI	// discard the unwanted high bits; ZF=1 iff the wanted bytes match
  1385  equal:
  1386  	SETEQ	(AX)	// relies on ZF from the SHLL above or from the CMPL at small
  1387  	RET
  1388  
  1389  TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
        	// Entry stub for cmpbody: load the result address and both
        	// (pointer, length) pairs into the registers cmpbody expects, then
        	// tail-jump. cmpbody stores 1/0/-1 through AX and returns to our caller.
        	// Frame: s1_base+0, s1_len+4, s2_base+8, s2_len+12, ret+16.
  1390  	LEAL	ret+16(FP), AX	// AX = address of the result word
  1391  	MOVL	s2_len+12(FP), DX	// DX = length of the second string
  1392  	MOVL	s2_base+8(FP), DI	// DI = data of the second string
  1393  	MOVL	s1_len+4(FP), BX	// BX = length of the first string
  1394  	MOVL	s1_base+0(FP), SI	// SI = data of the first string
  1395  	JMP	runtime·cmpbody(SB)
  1396  
  1397  TEXT bytes·Compare(SB),NOSPLIT,$0-28
        	// Entry stub for cmpbody: load the result address and both
        	// (pointer, length) pairs into the registers cmpbody expects, then
        	// tail-jump. cmpbody stores 1/0/-1 through AX and returns to our caller.
        	// The words at offsets 0/4 and 12/16 are passed as pointer/length.
  1398  	LEAL	ret+24(FP), AX	// AX = address of the result word
  1399  	MOVL	s2+16(FP), DX	// DX = length of the second slice
  1400  	MOVL	s2+12(FP), DI	// DI = data of the second slice
  1401  	MOVL	s1+4(FP), BX	// BX = length of the first slice
  1402  	MOVL	s1+0(FP), SI	// SI = data of the first slice
  1403  	JMP	runtime·cmpbody(SB)
  1404  
  1405  TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
        	// Stores in ret the index of the first byte equal to c within the
        	// s_len bytes at s, or -1 if there is none.
        	// Frame: s+0, s_len+4, c+12, ret+16.
  1406  	MOVL	s+0(FP), SI
  1407  	MOVL	s_len+4(FP), CX
  1408  	MOVB	c+12(FP), AL
  1409  	MOVL	SI, DI
        	// Scan forward from DI for AL, at most CX bytes. On a match ZF=1 and
        	// DI points one past the matching byte.
  1410  	CLD; REPN; SCASB
  1411  	JZ 3(PC)	// found: skip the two "not found" instructions below
  1412  	MOVL	$-1, ret+16(FP)
  1413  	RET
        	// When s_len is 0 SCASB runs no iterations and ZF is whatever it was
        	// on entry; either path still yields -1 because DI equals SI.
  1414  	SUBL	SI, DI
  1415  	SUBL	$1, DI	// DI is one past the match, so index = DI - SI - 1
  1416  	MOVL	DI, ret+16(FP)
  1417  	RET
  1418  
  1419  TEXT strings·IndexByte(SB),NOSPLIT,$0-16
        	// Stores in ret the index of the first byte equal to c within the
        	// s_len bytes at s, or -1 if there is none.
        	// Frame: s+0, s_len+4, c+8, ret+12.
  1420  	MOVL	s+0(FP), SI
  1421  	MOVL	s_len+4(FP), CX
  1422  	MOVB	c+8(FP), AL
  1423  	MOVL	SI, DI
        	// Scan forward from DI for AL, at most CX bytes. On a match ZF=1 and
        	// DI points one past the matching byte.
  1424  	CLD; REPN; SCASB
  1425  	JZ 3(PC)	// found: skip the two "not found" instructions below
  1426  	MOVL	$-1, ret+12(FP)
  1427  	RET
        	// When s_len is 0 SCASB runs no iterations and ZF is whatever it was
        	// on entry; either path still yields -1 because DI equals SI.
  1428  	SUBL	SI, DI
  1429  	SUBL	$1, DI	// DI is one past the match, so index = DI - SI - 1
  1430  	MOVL	DI, ret+12(FP)
  1431  	RET
  1432  
  1433  // input:
  1434  //   SI = a
  1435  //   DI = b
  1436  //   BX = alen
  1437  //   DX = blen
  1438  //   AX = address of return word (set to 1/0/-1)
        // Shared three-way comparison body, reached by tail-jump from cmpstring
        // and bytes·Compare. The first min(alen, blen) bytes are compared as
        // unsigned bytes; the first differing byte decides the result (+1 if a's
        // byte is larger, -1 otherwise). If that common prefix is identical, the
        // lengths decide. Returns directly to the stub's caller.
  1439  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1440  	MOVL	DX, BP
  1441  	SUBL	BX, DX // DX = blen-alen
  1442  	CMOVLGT	BX, BP // BP = min(alen, blen)
  1443  	CMPL	SI, DI
  1444  	JEQ	allsame	// same base pointer: only the lengths can differ
  1445  	CMPL	BP, $4
  1446  	JB	small
  1447  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
  1448  	JE	mediumloop
  1449  largeloop:
        	// 16 bytes at a time using xmm registers
  1450  	CMPL	BP, $16
  1451  	JB	mediumloop
  1452  	MOVOU	(SI), X0
  1453  	MOVOU	(DI), X1
  1454  	PCMPEQB X0, X1	// X1 byte = 0xff where a and b match, else 0
  1455  	PMOVMSKB X1, BX	// BX = one bit per byte position
  1456  	XORL	$0xffff, BX	// convert EQ to NE
  1457  	JNE	diff16	// branch if at least one byte is not equal
  1458  	ADDL	$16, SI
  1459  	ADDL	$16, DI
  1460  	SUBL	$16, BP
  1461  	JMP	largeloop
  1462  
  1463  diff16:
  1464  	BSFL	BX, BX	// index of first byte that differs
  1465  	XORL	DX, DX
  1466  	MOVB	(SI)(BX*1), CX
  1467  	CMPB	CX, (DI)(BX*1)
  1468  	SETHI	DX	// DX = 1 if a's byte is larger (unsigned), else 0
  1469  	LEAL	-1(DX*2), DX	// convert 1/0 to +1/-1
  1470  	MOVL	DX, (AX)
  1471  	RET
  1472  
  1473  mediumloop:
        	// 4 bytes at a time while more than 4 common bytes remain
  1474  	CMPL	BP, $4
  1475  	JBE	_0through4
  1476  	MOVL	(SI), BX
  1477  	MOVL	(DI), CX
  1478  	CMPL	BX, CX
  1479  	JNE	diff4
  1480  	ADDL	$4, SI
  1481  	ADDL	$4, DI
  1482  	SUBL	$4, BP
  1483  	JMP	mediumloop
  1484  
  1485  _0through4:
        	// Compare the 4 bytes that end at the last common byte. This path is
        	// only reached when the common length was at least 4, so the load
        	// stays inside the buffers; it may re-check bytes already compared.
  1486  	MOVL	-4(SI)(BP*1), BX
  1487  	MOVL	-4(DI)(BP*1), CX
  1488  	CMPL	BX, CX
  1489  	JEQ	allsame
  1490  
  1491  diff4:
        	// BX and CX hold differing 4-byte words of a and b. After the byte
        	// swap the first byte in memory is the most significant, so the
        	// highest differing bit belongs to the first differing byte.
  1492  	BSWAPL	BX	// reverse order of bytes
  1493  	BSWAPL	CX
  1494  	XORL	BX, CX	// find bit differences
  1495  	BSRL	CX, CX	// index of highest bit difference
  1496  	SHRL	CX, BX	// move a's bit to bottom
  1497  	ANDL	$1, BX	// mask bit
  1498  	LEAL	-1(BX*2), BX // 1/0 => +1/-1
  1499  	MOVL	BX, (AX)
  1500  	RET
  1501  
  1502  	// 0-3 bytes in common
  1503  small:
  1504  	LEAL	(BP*8), CX
  1505  	NEGL	CX	// CX = -8*BP; shifts use the low 5 bits of CL, i.e. 32-8*BP
  1506  	JEQ	allsame	// ZF from NEGL: no bytes in common
  1507  
  1508  	// load si
  1509  	CMPB	SI, $0xfc	// tests only the low byte of the address
  1510  	JA	si_high
  1511  	MOVL	(SI), SI
  1512  	JMP	si_finish
  1513  si_high:
        	// a 4-byte load at SI could cross a page boundary: load the 4 bytes
        	// ending at the last wanted byte and shift them down instead
  1514  	MOVL	-4(SI)(BP*1), SI
  1515  	SHRL	CX, SI
  1516  si_finish:
  1517  	SHLL	CX, SI	// drop bytes beyond the common length; wanted bytes end up in the high bytes
  1518  
  1519  	// same for di
  1520  	CMPB	DI, $0xfc
  1521  	JA	di_high
  1522  	MOVL	(DI), DI
  1523  	JMP	di_finish
  1524  di_high:
  1525  	MOVL	-4(DI)(BP*1), DI
  1526  	SHRL	CX, DI
  1527  di_finish:
  1528  	SHLL	CX, DI
  1529  
  1530  	BSWAPL	SI	// reverse order of bytes
  1531  	BSWAPL	DI
  1532  	XORL	SI, DI	// find bit differences
  1533  	JEQ	allsame
  1534  	BSRL	DI, CX	// index of highest bit difference
  1535  	SHRL	CX, SI	// move a's bit to bottom
  1536  	ANDL	$1, SI	// mask bit
  1537  	LEAL	-1(SI*2), BX // 1/0 => +1/-1
  1538  	MOVL	BX, (AX)
  1539  	RET
  1540  
  1541  	// all the bytes in common are the same, so we just need
  1542  	// to compare the lengths.
  1543  allsame:
        	// DX still holds blen-alen here: every path that jumps to allsame
        	// leaves DX untouched since the SUBL at the top.
  1544  	XORL	BX, BX
  1545  	XORL	CX, CX
  1546  	TESTL	DX, DX
  1547  	SETLT	BX	// 1 if alen > blen
  1548  	SETEQ	CX	// 1 if alen == blen
  1549  	LEAL	-1(CX)(BX*2), BX	// 1,0,-1 result
  1550  	MOVL	BX, (AX)
  1551  	RET
  1552  
  1553  TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
        	// Advances the generator state stored in the current g's m
        	// (m_fastrand) and returns the new value in ret:
        	//   x = 2*x; if the doubled value has its top bit set, x ^= 0x88888eef
  1554  	get_tls(CX)
  1555  	MOVL	g(CX), AX
  1556  	MOVL	g_m(AX), AX	// AX = current m
  1557  	MOVL	m_fastrand(AX), DX
  1558  	ADDL	DX, DX	// DX = 2*x
  1559  	MOVL	DX, BX	// BX = 2*x, kept un-xored
  1560  	XORL	$0x88888eef, DX	// the constant's top bit is set, so this flips the sign
  1561  	CMOVLMI	BX, DX	// xored value negative => 2*x had top bit clear => keep plain 2*x
  1562  	MOVL	DX, m_fastrand(AX)	// store the new state
  1563  	MOVL	DX, ret+0(FP)
  1564  	RET
  1565  
  1566  TEXT runtime·return0(SB), NOSPLIT, $0
  1567  	MOVL	$0, AX	// leave 0 in AX for the caller; no stack result is written
  1568  	RET
  1569  
  1570  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  1571  // Must obey the gcc calling convention.
        // The result is left in AX; only AX and CX are written.
  1572  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  1573  	get_tls(CX)
  1574  	MOVL	g(CX), AX	// AX = g
  1575  	MOVL	g_m(AX), AX	// AX = g->m
  1576  	MOVL	m_curg(AX), AX	// AX = g->m->curg
  1577  	MOVL	(g_stack+stack_hi)(AX), AX	// AX = g->m->curg.stack.hi
  1578  	RET
  1579  
  1580  // The top-most function running on a goroutine
  1581  // returns to goexit+PCQuantum.
        // NOTE(review): the leading NOP presumably exists so that
        // goexit+PCQuantum lands on the CALL below -- confirm PCQuantum for 386.
  1582  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  1583  	BYTE	$0x90	// NOP
  1584  	CALL	runtime·goexit1(SB)	// does not return
  1585  	// traceback from goexit1 must hit code range of goexit
        	// (the CALL's return address is the NOP below, still inside goexit)
  1586  	BYTE	$0x90	// NOP
  1587  
  1588  // Prefetching doesn't seem to help.
  1589  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4
  1590  	RET	// deliberately empty: the 4-byte argument is ignored, no prefetch is issued
  1591  
  1592  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4
  1593  	RET	// empty stub: the 4-byte argument is ignored, no prefetch is issued
  1594  
  1595  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4
  1596  	RET	// empty stub: the 4-byte argument is ignored, no prefetch is issued
  1597  
  1598  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4
  1599  	RET	// empty stub: the 4-byte argument is ignored, no prefetch is issued
  1600  
  1601  // Add a module's moduledata to the linked list of moduledata objects.  This
  1602  // is called from .init_array by a function generated in the linker and so
  1603  // follows the platform ABI wrt register preservation -- it only touches AX,
  1604  // CX (implicitly) and DX, but it does not follow the ABI wrt arguments:
  1605  // instead the pointer to the moduledata is passed in AX.
        // The new moduledata is appended after the current tail
        // (runtime·lastmoduledatap) and then becomes the tail itself.
  1606  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  1607         MOVL    runtime·lastmoduledatap(SB), DX // DX = current tail of the list
  1608         MOVL    AX, moduledata_next(DX)         // tail.next = new moduledata
  1609         MOVL    AX, runtime·lastmoduledatap(SB) // new moduledata becomes the tail
  1610         RET