github.com/tidwall/go@v0.0.0-20170415222209-6694a6888b7d/src/runtime/asm_386.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
        	// Runtime bootstrap: saves argc/argv, sets provisional g0 stack bounds,
        	// probes the CPU via CPUID, sets up TLS (through _cgo_init when it is
        	// non-nil, otherwise through ldt0setup), links g0 and m0, runs
        	// check/args/osinit/schedinit, queues runtime·main with newproc and
        	// finally calls mstart.
    11  	// copy arguments forward on an even stack
    12  	MOVL	argc+0(FP), AX
    13  	MOVL	argv+4(FP), BX
    14  	SUBL	$128, SP		// plenty of scratch
    15  	ANDL	$~15, SP		// round SP down to a multiple of 16
    16  	MOVL	AX, 120(SP)		// save argc, argv away
    17  	MOVL	BX, 124(SP)
    18  
    19  	// set default stack bounds.
    20  	// _cgo_init may update stackguard.
    21  	MOVL	$runtime·g0(SB), BP
    22  	LEAL	(-64*1024+104)(SP), BX
    23  	MOVL	BX, g_stackguard0(BP)
    24  	MOVL	BX, g_stackguard1(BP)
    25  	MOVL	BX, (g_stack+stack_lo)(BP)
    26  	MOVL	SP, (g_stack+stack_hi)(BP)
    27  	
    28  	// find out information about the processor we're on
    29  #ifdef GOOS_nacl // NaCl doesn't like PUSHFL/POPFL
    30  	JMP 	has_cpuid
    31  #else
    32  	// first see if CPUID instruction is supported.
        	// Toggle bit 21 of EFLAGS and check whether the change sticks.
    33  	PUSHFL
    34  	PUSHFL
    35  	XORL	$(1<<21), 0(SP) // flip ID bit
    36  	POPFL
    37  	PUSHFL
    38  	POPL	AX
    39  	XORL	0(SP), AX	// AX = bits that differ from the original EFLAGS
    40  	POPFL	// restore EFLAGS
    41  	TESTL	$(1<<21), AX
    42  	JNE 	has_cpuid
        	// Otherwise fall through: the ID bit could not be changed.
    43  #endif
    44  
    45  bad_proc: // show that the program requires MMX.
        	// write(2, bad_proc_msg, 0x3d) followed by exit(1); INT $3 if exit returns.
    46  	MOVL	$2, 0(SP)
    47  	MOVL	$bad_proc_msg<>(SB), 4(SP)
    48  	MOVL	$0x3d, 8(SP)
    49  	CALL	runtime·write(SB)
    50  	MOVL	$1, 0(SP)
    51  	CALL	runtime·exit(SB)
    52  	INT	$3
    53  
    54  has_cpuid:
    55  	MOVL	$0, AX
    56  	CPUID
    57  	MOVL	AX, SI		// SI = EAX from CPUID leaf 0; compared against 7 below
    58  	CMPL	AX, $0
    59  	JE	nocpuinfo
    60  
    61  	// Figure out how to serialize RDTSC.
    62  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    63  	// Don't know about the rest, so let's do MFENCE.
    64  	CMPL	BX, $0x756E6547  // "Genu"
    65  	JNE	notintel
    66  	CMPL	DX, $0x49656E69  // "ineI"
    67  	JNE	notintel
    68  	CMPL	CX, $0x6C65746E  // "ntel"
    69  	JNE	notintel
    70  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
    71  notintel:
    72  
    73  	// Load EAX=1 cpuid flags
    74  	MOVL	$1, AX
    75  	CPUID
    76  	MOVL	CX, AX // Move to global variable clobbers CX when generating PIC
    77  	MOVL	AX, runtime·cpuid_ecx(SB)
    78  	MOVL	DX, runtime·cpuid_edx(SB)
    79  
    80  	// Check for MMX support
    81  	TESTL	$(1<<23), DX	// MMX
    82  	JZ 	bad_proc
    83  
    84  	// Load EAX=7/ECX=0 cpuid flags
    85  	CMPL	SI, $7
    86  	JLT	nocpuinfo
    87  	MOVL	$7, AX
    88  	MOVL	$0, CX
    89  	CPUID
    90  	MOVL	BX, runtime·cpuid_ebx7(SB)
    91  
    92  nocpuinfo:	
    93  
    94  	// if there is an _cgo_init, call it to let it
    95  	// initialize and to set up GS.  if not,
    96  	// we set up GS ourselves.
    97  	MOVL	_cgo_init(SB), AX
    98  	TESTL	AX, AX
    99  	JZ	needtls
        	// Call _cgo_init with 0(SP) = g0 (still in BP) and 4(SP) = setg_gcc.
   100  	MOVL	$setg_gcc<>(SB), BX
   101  	MOVL	BX, 4(SP)
   102  	MOVL	BP, 0(SP)
   103  	CALL	AX
   104  
   105  	// update stackguard after _cgo_init
   106  	MOVL	$runtime·g0(SB), CX
   107  	MOVL	(g_stack+stack_lo)(CX), AX
   108  	ADDL	$const__StackGuard, AX
   109  	MOVL	AX, g_stackguard0(CX)
   110  	MOVL	AX, g_stackguard1(CX)
   111  
   112  #ifndef GOOS_windows
   113  	// skip runtime·ldt0setup(SB) and tls test after _cgo_init for non-windows
   114  	JMP ok
   115  #endif
   116  needtls:
   117  #ifdef GOOS_plan9
   118  	// skip runtime·ldt0setup(SB) and tls test on Plan 9 in all cases
   119  	JMP	ok
   120  #endif
   121  
   122  	// set up %gs
   123  	CALL	runtime·ldt0setup(SB)
   124  
   125  	// store through it, to make sure it works
        	// A value written through the TLS g slot must be readable from m0.tls.
   126  	get_tls(BX)
   127  	MOVL	$0x123, g(BX)
   128  	MOVL	runtime·m0+m_tls(SB), AX
   129  	CMPL	AX, $0x123
   130  	JEQ	ok
   131  	MOVL	AX, 0	// abort: store to address 0
   132  ok:
   133  	// set up m and g "registers"
   134  	get_tls(BX)
   135  	LEAL	runtime·g0(SB), DX
   136  	MOVL	DX, g(BX)
   137  	LEAL	runtime·m0(SB), AX
   138  
   139  	// save m->g0 = g0
   140  	MOVL	DX, m_g0(AX)
   141  	// save g0->m = m0
   142  	MOVL	AX, g_m(DX)
   143  
   144  	CALL	runtime·emptyfunc(SB)	// fault if stack check is wrong
   145  
   146  	// convention is D is always cleared
   147  	CLD
   148  
   149  	CALL	runtime·check(SB)
   150  
   151  	// saved argc, argv
        	// Reload them from the scratch slots and pass them to runtime·args.
   152  	MOVL	120(SP), AX
   153  	MOVL	AX, 0(SP)
   154  	MOVL	124(SP), AX
   155  	MOVL	AX, 4(SP)
   156  	CALL	runtime·args(SB)
   157  	CALL	runtime·osinit(SB)
   158  	CALL	runtime·schedinit(SB)
   159  
   160  	// create a new goroutine to start program
   161  	PUSHL	$runtime·mainPC(SB)	// entry
   162  	PUSHL	$0	// arg size
   163  	CALL	runtime·newproc(SB)
   164  	POPL	AX	// discard the two pushed arguments
   165  	POPL	AX
   166  
   167  	// start this M
   168  	CALL	runtime·mstart(SB)
   169  
        	// Trap if mstart ever returns.
   170  	INT $3
   171  	RET
   172  
   173  DATA	bad_proc_msg<>+0x00(SB)/8, $"This pro"	// 0x3d-byte message written to fd 2 by rt0_go's bad_proc path
   174  DATA	bad_proc_msg<>+0x08(SB)/8, $"gram can"
   175  DATA	bad_proc_msg<>+0x10(SB)/8, $" only be"
   176  DATA	bad_proc_msg<>+0x18(SB)/8, $" run on "
   177  DATA	bad_proc_msg<>+0x20(SB)/8, $"processo"
   178  DATA	bad_proc_msg<>+0x28(SB)/8, $"rs with "
   179  DATA	bad_proc_msg<>+0x30(SB)/8, $"MMX supp"
   180  DATA	bad_proc_msg<>+0x38(SB)/4, $"ort."
   181  DATA	bad_proc_msg<>+0x3c(SB)/1, $0xa	// trailing newline
   182  GLOBL	bad_proc_msg<>(SB), RODATA, $0x3d
   183  
   184  DATA	runtime·mainPC+0(SB)/4,$runtime·main(SB)	// 4-byte cell holding the address of runtime·main; rt0_go pushes &mainPC as newproc's entry argument
   185  GLOBL	runtime·mainPC(SB),RODATA,$4
   186  
   187  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
        	// Executes INT $3 (software breakpoint) and returns if execution resumes.
   188  	INT $3
   189  	RET
   190  
   191  TEXT runtime·asminit(SB),NOSPLIT,$0-0
        	// Loads the x87 FPU control word from runtime·controlWord64.
   192  	// Linux and MinGW start the FPU in extended double precision.
   193  	// Other operating systems use double precision.
   194  	// Change to double precision to match them,
   195  	// and to match other hardware that only has double.
   196  	FLDCW	runtime·controlWord64(SB)
   197  	RET
   198  
   199  /*
   200   *  go-routine
   201   */
   202  
   203  // void gosave(Gobuf*)
   204  // save state in Gobuf; setjmp
        // Stores the caller's SP, the return PC and the current g into buf and
        // zeroes buf.ret. Calls runtime·badctxt if buf.ctxt is non-zero.
   205  TEXT runtime·gosave(SB), NOSPLIT, $0-4
   206  	MOVL	buf+0(FP), AX		// gobuf
   207  	LEAL	buf+0(FP), BX		// caller's SP
   208  	MOVL	BX, gobuf_sp(AX)
   209  	MOVL	0(SP), BX		// caller's PC
   210  	MOVL	BX, gobuf_pc(AX)
   211  	MOVL	$0, gobuf_ret(AX)
   212  	// Assert ctxt is zero. See func save.
   213  	MOVL	gobuf_ctxt(AX), BX
   214  	TESTL	BX, BX
   215  	JZ	2(PC)			// ctxt == 0: skip the badctxt call
   216  	CALL	runtime·badctxt(SB)
   217  	get_tls(CX)
   218  	MOVL	g(CX), BX
   219  	MOVL	BX, gobuf_g(AX)		// buf.g = current g
   220  	RET
   221  
   222  // void gogo(Gobuf*)
   223  // restore state from Gobuf; longjmp
        // Installs buf.g as the current g, loads SP, AX (ret) and DX (ctxt) from
        // buf, clears those three fields in buf, and jumps to buf.pc.
   224  TEXT runtime·gogo(SB), NOSPLIT, $8-4
   225  	MOVL	buf+0(FP), BX		// gobuf
   226  
   227  	// If ctxt is not nil, invoke deletion barrier before overwriting.
   228  	MOVL	gobuf_ctxt(BX), DX
   229  	TESTL	DX, DX
   230  	JZ	nilctxt
        	// writebarrierptr_prewrite(&buf.ctxt, 0)
   231  	LEAL	gobuf_ctxt(BX), AX
   232  	MOVL	AX, 0(SP)
   233  	MOVL	$0, 4(SP)
   234  	CALL	runtime·writebarrierptr_prewrite(SB)
   235  	MOVL	buf+0(FP), BX		// reload gobuf pointer after the call
   236  
   237  nilctxt:
   238  	MOVL	gobuf_g(BX), DX
   239  	MOVL	0(DX), CX		// make sure g != nil
   240  	get_tls(CX)
   241  	MOVL	DX, g(CX)
   242  	MOVL	gobuf_sp(BX), SP	// restore SP
   243  	MOVL	gobuf_ret(BX), AX
   244  	MOVL	gobuf_ctxt(BX), DX
   245  	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
   246  	MOVL	$0, gobuf_ret(BX)
   247  	MOVL	$0, gobuf_ctxt(BX)
   248  	MOVL	gobuf_pc(BX), BX
   249  	JMP	BX
   250  
   251  // func mcall(fn func(*g))
   252  // Switch to m->g0's stack, call fn(g).
   253  // Fn must never return. It should gogo(&g->sched)
   254  // to keep running g.
        // Jumps to runtime·badmcall if the current g is already m->g0, and to
        // runtime·badmcall2 if fn returns.
   255  TEXT runtime·mcall(SB), NOSPLIT, $0-4
   256  	MOVL	fn+0(FP), DI
   257  
   258  	get_tls(DX)
   259  	MOVL	g(DX), AX	// save state in g->sched
   260  	MOVL	0(SP), BX	// caller's PC
   261  	MOVL	BX, (g_sched+gobuf_pc)(AX)
   262  	LEAL	fn+0(FP), BX	// caller's SP
   263  	MOVL	BX, (g_sched+gobuf_sp)(AX)
   264  	MOVL	AX, (g_sched+gobuf_g)(AX)
   265  
   266  	// switch to m->g0 & its stack, call fn
   267  	MOVL	g(DX), BX
   268  	MOVL	g_m(BX), BX
   269  	MOVL	m_g0(BX), SI
   270  	CMPL	SI, AX	// if g == m->g0 call badmcall
   271  	JNE	3(PC)	// g != m->g0: skip the two-instruction badmcall jump
   272  	MOVL	$runtime·badmcall(SB), AX
   273  	JMP	AX
   274  	MOVL	SI, g(DX)	// g = m->g0
   275  	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   276  	PUSHL	AX	// argument for fn: the g we switched away from
   277  	MOVL	DI, DX	// DX = fn (the func value)
   278  	MOVL	0(DI), DI	// DI = code pointer stored at 0(fn)
   279  	CALL	DI
   280  	POPL	AX
   281  	MOVL	$runtime·badmcall2(SB), AX	// fn returned: jump to badmcall2
   282  	JMP	AX
   283  	RET
   284  
   285  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   286  // of the G stack. We need to distinguish the routine that
   287  // lives at the bottom of the G stack from the one that lives
   288  // at the top of the system stack because the one at the top of
   289  // the system stack terminates the stack walk (see topofstack()).
        // systemstack stores this routine's address into g->sched.pc before
        // switching stacks; the body itself is just RET.
   290  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   291  	RET
   292  
   293  // func systemstack(fn func())
        // Runs fn on the g0 stack. If the current g is already g0 or gsignal, fn is
        // called directly on the current stack. If the current g is m->curg, the
        // routine saves state in g->sched, switches to g0's stack, calls fn, and
        // switches back to m->curg. Any other g is reported via
        // runtime·badsystemstack.
   294  TEXT runtime·systemstack(SB), NOSPLIT, $0-4
   295  	MOVL	fn+0(FP), DI	// DI = fn
   296  	get_tls(CX)
   297  	MOVL	g(CX), AX	// AX = g
   298  	MOVL	g_m(AX), BX	// BX = m
   299  
   300  	MOVL	m_gsignal(BX), DX	// DX = gsignal
   301  	CMPL	AX, DX
   302  	JEQ	noswitch
   303  
   304  	MOVL	m_g0(BX), DX	// DX = g0
   305  	CMPL	AX, DX
   306  	JEQ	noswitch
   307  
   308  	MOVL	m_curg(BX), BP
   309  	CMPL	AX, BP
   310  	JEQ	switch
   311  	
   312  	// Bad: g is not gsignal, not g0, not curg. What is it?
   313  	// Hide call from linker nosplit analysis.
   314  	MOVL	$runtime·badsystemstack(SB), AX
   315  	CALL	AX
        	// Trap instead of falling through into the switch path with clobbered
        	// registers, mirroring the INT $3 after the bad* calls in morestack.
        	INT	$3
   316  
   317  switch:
   318  	// save our state in g->sched. Pretend to
   319  	// be systemstack_switch if the G stack is scanned.
   320  	MOVL	$runtime·systemstack_switch(SB), (g_sched+gobuf_pc)(AX)
   321  	MOVL	SP, (g_sched+gobuf_sp)(AX)
   322  	MOVL	AX, (g_sched+gobuf_g)(AX)
   323  
   324  	// switch to g0
   325  	get_tls(CX)
   326  	MOVL	DX, g(CX)
   327  	MOVL	(g_sched+gobuf_sp)(DX), BX
   328  	// make it look like mstart called systemstack on g0, to stop traceback
   329  	SUBL	$4, BX
   330  	MOVL	$runtime·mstart(SB), DX
   331  	MOVL	DX, 0(BX)
   332  	MOVL	BX, SP
   333  
   334  	// call target function
   335  	MOVL	DI, DX	// DX = fn (the func value)
   336  	MOVL	0(DI), DI	// DI = code pointer stored at 0(fn)
   337  	CALL	DI
   338  
   339  	// switch back to g
   340  	get_tls(CX)
   341  	MOVL	g(CX), AX
   342  	MOVL	g_m(AX), BX
   343  	MOVL	m_curg(BX), AX
   344  	MOVL	AX, g(CX)
   345  	MOVL	(g_sched+gobuf_sp)(AX), SP
   346  	MOVL	$0, (g_sched+gobuf_sp)(AX)
   347  	RET
   348  
   349  noswitch:
   350  	// already on system stack, just call directly
   351  	MOVL	DI, DX
   352  	MOVL	0(DI), DI
   353  	CALL	DI
   354  	RET
   355  
   356  /*
   357   * support for morestack
   358   */
   359  
   360  // Called during function prolog when more stack is needed.
   361  //
   362  // The traceback routines see morestack on a g0 as being
   363  // the top of a stack (for example, morestack calling newstack
   364  // calling the scheduler calling newm calling gc), so we must
   365  // record an argument size. For that purpose, it has no arguments.
        //
        // On entry DX holds the ctxt value that is pushed as newstack's argument
        // (morestack_noctxt zeroes DX before jumping here).
   366  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   367  	// Cannot grow scheduler stack (m->g0).
   368  	get_tls(CX)
   369  	MOVL	g(CX), BX
   370  	MOVL	g_m(BX), BX
   371  	MOVL	m_g0(BX), SI
   372  	CMPL	g(CX), SI
   373  	JNE	3(PC)	// g != m->g0: skip the call and the trap
   374  	CALL	runtime·badmorestackg0(SB)
   375  	INT	$3
   376  
   377  	// Cannot grow signal stack.
   378  	MOVL	m_gsignal(BX), SI
   379  	CMPL	g(CX), SI
   380  	JNE	3(PC)	// g != m->gsignal: skip the call and the trap
   381  	CALL	runtime·badmorestackgsignal(SB)
   382  	INT	$3
   383  
   384  	// Called from f.
   385  	// Set m->morebuf to f's caller.
   386  	MOVL	4(SP), DI	// f's caller's PC
   387  	MOVL	DI, (m_morebuf+gobuf_pc)(BX)
   388  	LEAL	8(SP), CX	// f's caller's SP
   389  	MOVL	CX, (m_morebuf+gobuf_sp)(BX)
   390  	get_tls(CX)
   391  	MOVL	g(CX), SI
   392  	MOVL	SI, (m_morebuf+gobuf_g)(BX)
   393  
   394  	// Set g->sched to context in f.
   395  	MOVL	0(SP), AX	// f's PC
   396  	MOVL	AX, (g_sched+gobuf_pc)(SI)
   397  	MOVL	SI, (g_sched+gobuf_g)(SI)
   398  	LEAL	4(SP), AX	// f's SP
   399  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   400  	// newstack will fill gobuf.ctxt.
   401  
   402  	// Call newstack on m->g0's stack.
   403  	MOVL	m_g0(BX), BP
   404  	MOVL	BP, g(CX)	// current g = m->g0
   405  	MOVL	(g_sched+gobuf_sp)(BP), AX
   406  	MOVL	-4(AX), BX	// fault if CALL would, before smashing SP
   407  	MOVL	AX, SP
   408  	PUSHL	DX	// ctxt argument
   409  	CALL	runtime·newstack(SB)
   410  	MOVL	$0, 0x1003	// crash if newstack returns
   411  	POPL	DX	// keep balance check happy
   412  	RET
   413  
   414  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0-0
        	// Variant of morestack that first zeroes DX (the ctxt value morestack
        	// passes to newstack) and then jumps to runtime·morestack.
   415  	MOVL	$0, DX
   416  	JMP runtime·morestack(SB)
   417  
   418  // reflectcall: call a function with the given argument list
   419  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   420  // we don't have variable-sized frames, so we use a small number
   421  // of constant-sized-frame functions to encode a few bits of size in the pc.
   422  // Caution: ugly multiline assembly macros in your future!
   423  
        // DISPATCH(NAME, MAXSIZE): if CX <= MAXSIZE (unsigned compare), jump to NAME
        // through AX; otherwise (JA taken) skip the jump and fall through to
        // whatever follows the macro. Clobbers AX.
   424  #define DISPATCH(NAME,MAXSIZE)		\
   425  	CMPL	CX, $MAXSIZE;		\
   426  	JA	3(PC);			\
   427  	MOVL	$NAME(SB), AX;		\
   428  	JMP	AX
   429  // Note: can't just "JMP NAME(SB)" - bad inlining results.
   430  
   431  TEXT reflect·call(SB), NOSPLIT, $0-0
        	// Alias entry point: jumps straight to ·reflectcall.
   432  	JMP	·reflectcall(SB)
   433  
   434  TEXT ·reflectcall(SB), NOSPLIT, $0-20
        	// Loads argsize into CX and jumps to the first runtime·callN stub whose
        	// N is >= argsize (16 bytes up to 1 GiB, doubling each step). If
        	// argsize exceeds the largest size, jumps to runtime·badreflectcall.
   435  	MOVL	argsize+12(FP), CX
   436  	DISPATCH(runtime·call16, 16)
   437  	DISPATCH(runtime·call32, 32)
   438  	DISPATCH(runtime·call64, 64)
   439  	DISPATCH(runtime·call128, 128)
   440  	DISPATCH(runtime·call256, 256)
   441  	DISPATCH(runtime·call512, 512)
   442  	DISPATCH(runtime·call1024, 1024)
   443  	DISPATCH(runtime·call2048, 2048)
   444  	DISPATCH(runtime·call4096, 4096)
   445  	DISPATCH(runtime·call8192, 8192)
   446  	DISPATCH(runtime·call16384, 16384)
   447  	DISPATCH(runtime·call32768, 32768)
   448  	DISPATCH(runtime·call65536, 65536)
   449  	DISPATCH(runtime·call131072, 131072)
   450  	DISPATCH(runtime·call262144, 262144)
   451  	DISPATCH(runtime·call524288, 524288)
   452  	DISPATCH(runtime·call1048576, 1048576)
   453  	DISPATCH(runtime·call2097152, 2097152)
   454  	DISPATCH(runtime·call4194304, 4194304)
   455  	DISPATCH(runtime·call8388608, 8388608)
   456  	DISPATCH(runtime·call16777216, 16777216)
   457  	DISPATCH(runtime·call33554432, 33554432)
   458  	DISPATCH(runtime·call67108864, 67108864)
   459  	DISPATCH(runtime·call134217728, 134217728)
   460  	DISPATCH(runtime·call268435456, 268435456)
   461  	DISPATCH(runtime·call536870912, 536870912)
   462  	DISPATCH(runtime·call1073741824, 1073741824)
        	// No stub is large enough for argsize.
   463  	MOVL	$runtime·badreflectcall(SB), AX
   464  	JMP	AX
   465  
   466  #define CALLFN(NAME,MAXSIZE)			\
   467  TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
   468  	NO_LOCAL_POINTERS;			\
   469  	/* copy argsize bytes from argptr to the bottom of this frame */	\
   470  	MOVL	argptr+8(FP), SI;		\
   471  	MOVL	argsize+12(FP), CX;		\
   472  	MOVL	SP, DI;				\
   473  	REP;MOVSB;				\
   474  	/* call f: DX = f (func value), AX = code pointer loaded from (DX) */	\
   475  	MOVL	f+4(FP), DX;			\
   476  	MOVL	(DX), AX; 			\
   477  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   478  	CALL	AX;				\
   479  	/* copy results back via callRet: DX=argtype, DI=argptr+retoffset, SI=SP+retoffset, CX=argsize-retoffset */	\
   480  	MOVL	argtype+0(FP), DX;		\
   481  	MOVL	argptr+8(FP), DI;		\
   482  	MOVL	argsize+12(FP), CX;		\
   483  	MOVL	retoffset+16(FP), BX;		\
   484  	MOVL	SP, SI;				\
   485  	ADDL	BX, DI;				\
   486  	ADDL	BX, SI;				\
   487  	SUBL	BX, CX;				\
   488  	CALL	callRet<>(SB);			\
   489  	RET
   490  
   491  // callRet finishes a call* stub by handing the results to
   492  // runtime·reflectcallmove. It lives in its own function so that it gets a
   493  // frame for reflectcallmove's arguments. It does not follow the Go ABI:
   494  // its inputs arrive in DX, DI, SI and CX and are stored to 0, 4, 8 and 12(SP).
   495  TEXT callRet<>(SB), NOSPLIT, $16-0
   496  	MOVL	CX, 12(SP)
   497  	MOVL	SI, 8(SP)
   498  	MOVL	DI, 4(SP)
   499  	MOVL	DX, 0(SP)
   500  	CALL	runtime·reflectcallmove(SB)
   501  	RET
   502  
   503  CALLFN(·call16, 16)	// one fixed-frame stub per size that ·reflectcall dispatches to
   504  CALLFN(·call32, 32)
   505  CALLFN(·call64, 64)
   506  CALLFN(·call128, 128)
   507  CALLFN(·call256, 256)
   508  CALLFN(·call512, 512)
   509  CALLFN(·call1024, 1024)
   510  CALLFN(·call2048, 2048)
   511  CALLFN(·call4096, 4096)
   512  CALLFN(·call8192, 8192)
   513  CALLFN(·call16384, 16384)
   514  CALLFN(·call32768, 32768)
   515  CALLFN(·call65536, 65536)
   516  CALLFN(·call131072, 131072)
   517  CALLFN(·call262144, 262144)
   518  CALLFN(·call524288, 524288)
   519  CALLFN(·call1048576, 1048576)
   520  CALLFN(·call2097152, 2097152)
   521  CALLFN(·call4194304, 4194304)
   522  CALLFN(·call8388608, 8388608)
   523  CALLFN(·call16777216, 16777216)
   524  CALLFN(·call33554432, 33554432)
   525  CALLFN(·call67108864, 67108864)
   526  CALLFN(·call134217728, 134217728)
   527  CALLFN(·call268435456, 268435456)
   528  CALLFN(·call536870912, 536870912)
   529  CALLFN(·call1073741824, 1073741824)
   530  
   531  TEXT runtime·procyield(SB),NOSPLIT,$0-4
        	// Spin-wait: executes PAUSE `cycles` times, then returns.
        	// cycles is read as a 32-bit value at 0(FP), so the declared argument
        	// size is 4 bytes.
        	// The counter is decremented before it is tested, so cycles == 0 wraps
        	// around instead of returning immediately; callers must pass a
        	// non-zero count.
   532  	MOVL	cycles+0(FP), AX
   533  again:
   534  	PAUSE
   535  	SUBL	$1, AX
   536  	JNZ	again
   537  	RET
   538  
   539  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
        	// Intentionally empty apart from RET; no fence instruction is emitted.
   540  	// Stores are already ordered on x86, so this is just a
   541  	// compile barrier.
   542  	RET
   543  
   544  // void jmpdefer(fn, sp);
   545  // called from deferreturn.
   546  // 1. pop the caller
   547  // 2. sub 5 bytes (the length of CALL & a 32 bit displacement) from the callers
   548  //    return (when building for shared libraries, subtract 16 bytes -- 5 bytes
   549  //    for CALL & displacement to call __x86.get_pc_thunk.cx, 6 bytes for the
   550  //    LEAL to load the offset into BX, and finally 5 for the call & displacement)
   551  // 3. jmp to the argument
        // The rewound return address makes the deferred function return to the
        // CALL instruction again rather than to the instruction after it.
   552  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
   553  	MOVL	fv+0(FP), DX	// fn
   554  	MOVL	argp+4(FP), BX	// caller sp
   555  	LEAL	-4(BX), SP	// caller sp after CALL
   556  #ifdef GOBUILDMODE_shared
   557  	SUBL	$16, (SP)	// return to CALL again
   558  #else
   559  	SUBL	$5, (SP)	// return to CALL again
   560  #endif
   561  	MOVL	0(DX), BX	// BX = code pointer stored at 0(fn); DX keeps fn
   562  	JMP	BX	// but first run the deferred function
   563  
   564  // Save state of caller into g->sched.
        // Writes sched.sp and sched.pc for the current g, zeroes sched.ret, and
        // calls runtime·badctxt if sched.ctxt is non-zero. AX and BX are saved
        // on entry and restored before returning.
   565  TEXT gosave<>(SB),NOSPLIT,$0
   566  	PUSHL	AX
   567  	PUSHL	BX
   568  	get_tls(BX)
   569  	MOVL	g(BX), BX	// BX = current g
   570  	LEAL	arg+0(FP), AX	// address of this function's first argument slot
   571  	MOVL	AX, (g_sched+gobuf_sp)(BX)
   572  	MOVL	-4(AX), AX	// word just below that slot: the return address
   573  	MOVL	AX, (g_sched+gobuf_pc)(BX)
   574  	MOVL	$0, (g_sched+gobuf_ret)(BX)
   575  	// Assert ctxt is zero. See func save.
   576  	MOVL	(g_sched+gobuf_ctxt)(BX), AX
   577  	TESTL	AX, AX
   578  	JZ	2(PC)	// ctxt == 0: skip the badctxt call
   579  	CALL	runtime·badctxt(SB)
   580  	POPL	BX
   581  	POPL	AX
   582  	RET
   583  
   584  // func asmcgocall(fn, arg unsafe.Pointer) int32
   585  // Call fn(arg) on the scheduler stack,
   586  // aligned appropriately for the gcc ABI.
   587  // See cgocall.go for more details.
        // The value left in AX by fn is stored as the result.
   588  TEXT ·asmcgocall(SB),NOSPLIT,$0-12
   589  	MOVL	fn+0(FP), AX
   590  	MOVL	arg+4(FP), BX
   591  
   592  	MOVL	SP, DX	// DX = SP on entry; used below to compute the stack depth
   593  
   594  	// Figure out if we need to switch to m->g0 stack.
   595  	// We get called to create new OS threads too, and those
   596  	// come in on the m->g0 stack already.
   597  	get_tls(CX)
   598  	MOVL	g(CX), BP
   599  	MOVL	g_m(BP), BP
   600  	MOVL	m_g0(BP), SI	// SI = m->g0
   601  	MOVL	g(CX), DI	// DI = current g
   602  	CMPL	SI, DI
   603  	JEQ	noswitch
   604  	CALL	gosave<>(SB)	// record the caller's state in g->sched
   605  	get_tls(CX)
   606  	MOVL	SI, g(CX)	// current g = m->g0
   607  	MOVL	(g_sched+gobuf_sp)(SI), SP
   608  
   609  noswitch:
   610  	// Now on a scheduling stack (a pthread-created stack).
   611  	SUBL	$32, SP
   612  	ANDL	$~15, SP	// alignment, perhaps unnecessary
   613  	MOVL	DI, 8(SP)	// save g
   614  	MOVL	(g_stack+stack_hi)(DI), DI
   615  	SUBL	DX, DI	// DI = g->stack.hi - entry SP
   616  	MOVL	DI, 4(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   617  	MOVL	BX, 0(SP)	// first argument in x86-32 ABI
   618  	CALL	AX
   619  
   620  	// Restore registers, g, stack pointer.
   621  	get_tls(CX)
   622  	MOVL	8(SP), DI	// DI = saved g
   623  	MOVL	(g_stack+stack_hi)(DI), SI
   624  	SUBL	4(SP), SI	// SI = g->stack.hi - saved depth = original SP
   625  	MOVL	DI, g(CX)
   626  	MOVL	SI, SP
   627  
   628  	MOVL	AX, ret+8(FP)
   629  	RET
   630  
   631  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
   632  // Forwards to cgocallback_gofunc, passing the address of the fn argument
   633  // slot as the Go func value, followed by frame, framesize and ctxt.
   634  TEXT runtime·cgocallback(SB),NOSPLIT,$16-16
   635  	MOVL	ctxt+12(FP), AX
   636  	MOVL	AX, 12(SP)
   637  	MOVL	framesize+8(FP), AX
   638  	MOVL	AX, 8(SP)
   639  	MOVL	frame+4(FP), AX
   640  	MOVL	AX, 4(SP)
   641  	LEAL	fn+0(FP), AX	// &fn stands in for the FuncVal*
   642  	MOVL	AX, 0(SP)
   643  	MOVL	$runtime·cgocallback_gofunc(SB), AX
   644  	CALL	AX
   645  	RET
   646  
   647  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
   648  // See cgocall.go for more details.
        // Obtains an m if the thread has no g (needm), switches from m->g0 to
        // m->curg, calls runtime·cgocallbackg there, switches back to m->g0, and
        // calls dropm if an m was borrowed.
   649  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-16
   650  	NO_LOCAL_POINTERS
   651  
   652  	// If g is nil, Go did not create the current thread.
   653  	// Call needm to obtain one for temporary use.
   654  	// In this case, we're running on the thread stack, so there's
   655  	// lots of space, but the linker doesn't know. Hide the call from
   656  	// the linker analysis by using an indirect call through AX.
   657  	get_tls(CX)
   658  #ifdef GOOS_windows
   659  	MOVL	$0, BP
   660  	CMPL	CX, $0
   661  	JEQ	2(PC) // TODO
   662  #endif
   663  	MOVL	g(CX), BP
   664  	CMPL	BP, $0
   665  	JEQ	needm
   666  	MOVL	g_m(BP), BP
   667  	MOVL	BP, DX // saved copy of oldm
   668  	JMP	havem
   669  needm:
   670  	MOVL	$0, 0(SP)
   671  	MOVL	$runtime·needm(SB), AX
   672  	CALL	AX
   673  	MOVL	0(SP), DX	// DX = word at 0(SP), which was set to 0 before the call
   674  	get_tls(CX)
   675  	MOVL	g(CX), BP
   676  	MOVL	g_m(BP), BP
   677  
   678  	// Set m->g0->sched.sp = SP, so that if a panic happens
   679  	// during the function we are about to execute, it will
   680  	// have a valid SP to run on the g0 stack.
   681  	// The next few lines (after the havem label)
   682  	// will save this SP onto the stack and then write
   683  	// the same SP back to m->g0->sched.sp. That seems redundant,
   684  	// but if an unrecovered panic happens, unwindm will
   685  	// restore the g->sched.sp from the stack location
   686  	// and then systemstack will try to use it. If we don't set it here,
   687  	// that restored SP will be uninitialized (typically 0) and
   688  	// will not be usable.
   689  	MOVL	m_g0(BP), SI
   690  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   691  
   692  havem:
   693  	// Now there's a valid m, and we're running on its m->g0.
   694  	// Save current m->g0->sched.sp on stack and then set it to SP.
   695  	// Save current sp in m->g0->sched.sp in preparation for
   696  	// switch back to m->curg stack.
   697  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   698  	MOVL	m_g0(BP), SI
   699  	MOVL	(g_sched+gobuf_sp)(SI), AX
   700  	MOVL	AX, 0(SP)
   701  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   702  
   703  	// Switch to m->curg stack and call runtime.cgocallbackg.
   704  	// Because we are taking over the execution of m->curg
   705  	// but *not* resuming what had been running, we need to
   706  	// save that information (m->curg->sched) so we can restore it.
   707  	// We can restore m->curg->sched.sp easily, because calling
   708  	// runtime.cgocallbackg leaves SP unchanged upon return.
   709  	// To save m->curg->sched.pc, we push it onto the stack.
   710  	// This has the added benefit that it looks to the traceback
   711  	// routine like cgocallbackg is going to return to that
   712  	// PC (because the frame we allocate below has the same
   713  	// size as cgocallback_gofunc's frame declared above)
   714  	// so that the traceback will seamlessly trace back into
   715  	// the earlier calls.
   716  	//
   717  	// In the new goroutine, 4(SP) holds the saved oldm (DX) register.
   718  	// 8(SP) is unused.
   719  	MOVL	m_curg(BP), SI
   720  	MOVL	SI, g(CX)
   721  	MOVL	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
   722  	MOVL	(g_sched+gobuf_pc)(SI), BP
   723  	MOVL	BP, -4(DI)	// saved curg->sched.pc sits where a return address would
   724  	MOVL	ctxt+12(FP), CX
   725  	LEAL	-(4+12)(DI), SP	// 4 bytes for the saved pc plus a 12-byte frame
   726  	MOVL	DX, 4(SP)
   727  	MOVL	CX, 0(SP)	// ctxt is cgocallbackg's argument
   728  	CALL	runtime·cgocallbackg(SB)
   729  	MOVL	4(SP), DX	// reload saved oldm
   730  
   731  	// Restore g->sched (== m->curg->sched) from saved values.
   732  	get_tls(CX)
   733  	MOVL	g(CX), SI
   734  	MOVL	12(SP), BP	// the pc stored at -4(DI) above
   735  	MOVL	BP, (g_sched+gobuf_pc)(SI)
   736  	LEAL	(12+4)(SP), DI
   737  	MOVL	DI, (g_sched+gobuf_sp)(SI)
   738  
   739  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   740  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   741  	// so we do not have to restore it.)
   742  	MOVL	g(CX), BP
   743  	MOVL	g_m(BP), BP
   744  	MOVL	m_g0(BP), SI
   745  	MOVL	SI, g(CX)
   746  	MOVL	(g_sched+gobuf_sp)(SI), SP
   747  	MOVL	0(SP), AX
   748  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   749  	
   750  	// If the m on entry was nil, we called needm above to borrow an m
   751  	// for the duration of the call. Since the call is over, return it with dropm.
   752  	CMPL	DX, $0
   753  	JNE 3(PC)
   754  	MOVL	$runtime·dropm(SB), AX
   755  	CALL	AX
   756  
   757  	// Done!
   758  	RET
   759  
   760  // void setg(G*); set g. for use by needm.
        // Stores gg into the TLS g slot. On Windows it first points 0x14(FS) at
        // gg->m->tls, or clears 0x14(FS) and returns early when gg is nil.
   761  TEXT runtime·setg(SB), NOSPLIT, $0-4
   762  	MOVL	gg+0(FP), BX
   763  #ifdef GOOS_windows
   764  	CMPL	BX, $0
   765  	JNE	settls
   766  	MOVL	$0, 0x14(FS)
   767  	RET
   768  settls:
   769  	MOVL	g_m(BX), AX
   770  	LEAL	m_tls(AX), AX	// AX = &gg->m->tls
   771  	MOVL	AX, 0x14(FS)
   772  #endif
   773  	get_tls(CX)
   774  	MOVL	BX, g(CX)
   775  	RET
   776  
   777  // void setg_gcc(G*); stores its argument into the TLS g slot. for use by gcc
   778  TEXT setg_gcc<>(SB), NOSPLIT, $0
   779  	MOVL	gg+0(FP), DX	// DX = new g
   780  	get_tls(AX)
   781  	MOVL	DX, g(AX)
   782  	RET
   783  
   784  // check that SP is strictly inside (g->stack.lo, g->stack.hi);
        // executes INT $3 when either bound is violated (both compares are
        // unsigned and strict, so SP == stack.lo or SP == stack.hi also traps).
   785  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   786  	get_tls(CX)
   787  	MOVL	g(CX), AX
   788  	CMPL	(g_stack+stack_hi)(AX), SP
   789  	JHI	2(PC)	// stack.hi > SP: upper bound holds
   790  	INT	$3
   791  	CMPL	SP, (g_stack+stack_lo)(AX)
   792  	JHI	2(PC)	// SP > stack.lo: lower bound holds
   793  	INT	$3
   794  	RET
   795  
   796  TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8
        	// Given argp, the address of the first argument of some function,
        	// returns the word stored just below it (that function's return PC).
   797  	MOVL	argp+0(FP),AX		// addr of first arg
   798  	MOVL	-4(AX),AX		// get calling pc
   799  	MOVL	AX, ret+4(FP)
   800  	RET
   801  
   802  // func cputicks() int64
        // Returns the RDTSC counter (low half from AX, high half from DX).
        // When bit 26 of cpuid_edx is set, a fence is issued first: LFENCE if
        // lfenceBeforeRdtsc is 1, otherwise MFENCE.
   803  TEXT runtime·cputicks(SB),NOSPLIT,$0-8
   804  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // no sse2, no mfence
   805  	JEQ	done
   806  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   807  	JNE	mfence
   808  	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
   809  	JMP	done
   810  mfence:
   811  	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
   812  done:
   813  	RDTSC
   814  	MOVL	AX, ret_lo+0(FP)
   815  	MOVL	DX, ret_hi+4(FP)
   816  	RET
   817  
   818  TEXT runtime·ldt0setup(SB),NOSPLIT,$16-0
   819  	// Calls setldt(7, &m0.tls, 32) so that LDT entry 7 refers to m0.tls.
   820  	// Entry 1 would do on Linux, but OS X cannot go lower than 7.
   821  	// The number is only a hint; setldt loads GS with whatever it really used.
   822  	MOVL	$32, 8(SP)	// sizeof(tls array)
   823  	LEAL	runtime·m0+m_tls(SB), AX
   824  	MOVL	AX, 4(SP)	// base address: m0.tls
   825  	MOVL	$7, 0(SP)	// requested entry number
   826  	CALL	runtime·setldt(SB)
   827  	RET
   828  
   829  TEXT runtime·emptyfunc(SB),0,$0-0	// declared without NOSPLIT; rt0_go calls it to "fault if stack check is wrong"
   830  	RET
   831  
   832  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   833  // Forwards to memhash(p, h, size), taking size from the word at
   834  // offset 4 of the closure that DX points to.
   835  TEXT runtime·memhash_varlen(SB),NOSPLIT,$16-12
   836  	GO_ARGS
   837  	NO_LOCAL_POINTERS
   838  	MOVL	p+0(FP), AX
   839  	MOVL	AX, 0(SP)	// memhash arg 0: p
   840  	MOVL	h+4(FP), BX
   841  	MOVL	BX, 4(SP)	// memhash arg 1: h
   842  	MOVL	4(DX), CX
   843  	MOVL	CX, 8(SP)	// memhash arg 2: size from the closure
   844  	CALL	runtime·memhash(SB)
   845  	MOVL	12(SP), AX	// memhash's result slot
   846  	MOVL	AX, ret+8(FP)
   847  	RET
   848  
   849  // AES-instruction hash: sets up AX/BX/DX and jumps to aeshashbody
   850  TEXT runtime·aeshash(SB),NOSPLIT,$0-16
   851  	LEAL	ret+12(FP), DX	// where aeshashbody stores the result
   852  	MOVL	s+8(FP), BX	// size
   853  	MOVL	p+0(FP), AX	// ptr to data
   854  	JMP	runtime·aeshashbody(SB)
   855  
   856  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
        	// String variant: unpacks the (data, length) pair behind p and jumps
        	// to aeshashbody.
   857  	LEAL	ret+8(FP), DX	// where aeshashbody stores the result
   858  	MOVL	p+0(FP), AX	// ptr to string object
   859  	MOVL	4(AX), BX	// length of string
   860  	MOVL	(AX), AX	// string data
   861  	JMP	runtime·aeshashbody(SB)
   862  
   863  // AX: data
   864  // BX: length
   865  // DX: address to put return value
        // Reached by JMP from aeshash/aeshashstr, so h+4(FP) still refers to the
        // original caller's seed argument. Dispatches on length: 0, 1-15, 16,
        // 17-32, 33-64 and 65+ bytes each have their own path; every path
        // stores the low 32 bits of the final XMM state to (DX).
   866  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   867  	MOVL	h+4(FP), X0	            // 32 bits of per-table hash seed
   868  	PINSRW	$4, BX, X0	            // 16 bits of length
   869  	PSHUFHW	$0, X0, X0	            // replace size with its low 2 bytes repeated 4 times
   870  	MOVO	X0, X1                      // save unscrambled seed
   871  	PXOR	runtime·aeskeysched(SB), X0 // xor in per-process seed
   872  	AESENC	X0, X0                      // scramble seed
   873  
   874  	CMPL	BX, $16
   875  	JB	aes0to15
   876  	JE	aes16
   877  	CMPL	BX, $32
   878  	JBE	aes17to32
   879  	CMPL	BX, $64
   880  	JBE	aes33to64
   881  	JMP	aes65plus
   882  	
   883  aes0to15:
   884  	TESTL	BX, BX
   885  	JE	aes0
   886  
   887  	ADDL	$16, AX
   888  	TESTW	$0xff0, AX
   889  	JE	endofpage
   890  
   891  	// 16 bytes loaded at this address won't cross
   892  	// a page boundary, so we can load it directly.
   893  	MOVOU	-16(AX), X1
   894  	ADDL	BX, BX
   895  	PAND	masks<>(SB)(BX*8), X1	// table index = length*16 bytes (BX doubled, scale 8)
   896  
   897  final1:	
   898  	AESENC	X0, X1  // scramble input, xor in seed
   899  	AESENC	X1, X1  // scramble combo 2 times
   900  	AESENC	X1, X1
   901  	MOVL	X1, (DX)
   902  	RET
   903  
   904  endofpage:
   905  	// address ends in 1111xxxx. Might be up against
   906  	// a page boundary, so load ending at last byte.
   907  	// Then shift bytes down using pshufb.
   908  	MOVOU	-32(AX)(BX*1), X1
   909  	ADDL	BX, BX
   910  	PSHUFB	shifts<>(SB)(BX*8), X1	// table index = length*16 bytes, as for masks above
   911  	JMP	final1
   912  
   913  aes0:
   914  	// Return scrambled input seed
   915  	AESENC	X0, X0
   916  	MOVL	X0, (DX)
   917  	RET
   918  
   919  aes16:
   920  	MOVOU	(AX), X1
   921  	JMP	final1
   922  
   923  aes17to32:
   924  	// make second starting seed
   925  	PXOR	runtime·aeskeysched+16(SB), X1
   926  	AESENC	X1, X1
   927  	
   928  	// load data to be hashed
        	// first 16 bytes and last 16 bytes (they overlap when length < 32)
   929  	MOVOU	(AX), X2
   930  	MOVOU	-16(AX)(BX*1), X3
   931  
   932  	// scramble 3 times
   933  	AESENC	X0, X2
   934  	AESENC	X1, X3
   935  	AESENC	X2, X2
   936  	AESENC	X3, X3
   937  	AESENC	X2, X2
   938  	AESENC	X3, X3
   939  
   940  	// combine results
   941  	PXOR	X3, X2
   942  	MOVL	X2, (DX)
   943  	RET
   944  
   945  aes33to64:
   946  	// make 3 more starting seeds
   947  	MOVO	X1, X2
   948  	MOVO	X1, X3
   949  	PXOR	runtime·aeskeysched+16(SB), X1
   950  	PXOR	runtime·aeskeysched+32(SB), X2
   951  	PXOR	runtime·aeskeysched+48(SB), X3
   952  	AESENC	X1, X1
   953  	AESENC	X2, X2
   954  	AESENC	X3, X3
   955  	
        	// first 32 bytes and last 32 bytes (they overlap when length < 64)
   956  	MOVOU	(AX), X4
   957  	MOVOU	16(AX), X5
   958  	MOVOU	-32(AX)(BX*1), X6
   959  	MOVOU	-16(AX)(BX*1), X7
   960  	
   961  	AESENC	X0, X4
   962  	AESENC	X1, X5
   963  	AESENC	X2, X6
   964  	AESENC	X3, X7
   965  	
   966  	AESENC	X4, X4
   967  	AESENC	X5, X5
   968  	AESENC	X6, X6
   969  	AESENC	X7, X7
   970  	
   971  	AESENC	X4, X4
   972  	AESENC	X5, X5
   973  	AESENC	X6, X6
   974  	AESENC	X7, X7
   975  
        	// fold the four lanes into X4
   976  	PXOR	X6, X4
   977  	PXOR	X7, X5
   978  	PXOR	X5, X4
   979  	MOVL	X4, (DX)
   980  	RET
   981  
   982  aes65plus:
   983  	// make 3 more starting seeds
   984  	MOVO	X1, X2
   985  	MOVO	X1, X3
   986  	PXOR	runtime·aeskeysched+16(SB), X1
   987  	PXOR	runtime·aeskeysched+32(SB), X2
   988  	PXOR	runtime·aeskeysched+48(SB), X3
   989  	AESENC	X1, X1
   990  	AESENC	X2, X2
   991  	AESENC	X3, X3
   992  	
   993  	// start with last (possibly overlapping) block
   994  	MOVOU	-64(AX)(BX*1), X4
   995  	MOVOU	-48(AX)(BX*1), X5
   996  	MOVOU	-32(AX)(BX*1), X6
   997  	MOVOU	-16(AX)(BX*1), X7
   998  
   999  	// scramble state once
  1000  	AESENC	X0, X4
  1001  	AESENC	X1, X5
  1002  	AESENC	X2, X6
  1003  	AESENC	X3, X7
  1004  
  1005  	// compute number of remaining 64-byte blocks
        	// BX = (length-1) >> 6, which is at least 1 here because length >= 65
  1006  	DECL	BX
  1007  	SHRL	$6, BX
  1008  	
  1009  aesloop:
  1010  	// scramble state, xor in a block
  1011  	MOVOU	(AX), X0
  1012  	MOVOU	16(AX), X1
  1013  	MOVOU	32(AX), X2
  1014  	MOVOU	48(AX), X3
  1015  	AESENC	X0, X4
  1016  	AESENC	X1, X5
  1017  	AESENC	X2, X6
  1018  	AESENC	X3, X7
  1019  
  1020  	// scramble state
  1021  	AESENC	X4, X4
  1022  	AESENC	X5, X5
  1023  	AESENC	X6, X6
  1024  	AESENC	X7, X7
  1025  
  1026  	ADDL	$64, AX
  1027  	DECL	BX
  1028  	JNE	aesloop
  1029  
  1030  	// 2 more scrambles to finish
  1031  	AESENC	X4, X4
  1032  	AESENC	X5, X5
  1033  	AESENC	X6, X6
  1034  	AESENC	X7, X7
  1035  	
  1036  	AESENC	X4, X4
  1037  	AESENC	X5, X5
  1038  	AESENC	X6, X6
  1039  	AESENC	X7, X7
  1040  
        	// fold the four lanes into X4
  1041  	PXOR	X6, X4
  1042  	PXOR	X7, X5
  1043  	PXOR	X5, X4
  1044  	MOVL	X4, (DX)
  1045  	RET
  1046  
  1047  TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
        	// Hashes the 4 bytes at p with seed h and stores a 32-bit result.
        	// Arguments: p+0(FP) data pointer, h+4(FP) seed; result at ret+8(FP).
        	// X0 is built with the seed in dword lane 0 and the data in lane 1,
        	// then run through three AESENC rounds keyed from runtime·aeskeysched.
  1048  	MOVL	p+0(FP), AX	// ptr to data
  1049  	MOVL	h+4(FP), X0	// seed
  1050  	PINSRD	$1, (AX), X0	// data
  1051  	AESENC	runtime·aeskeysched+0(SB), X0	// round 1, key bytes 0-15
  1052  	AESENC	runtime·aeskeysched+16(SB), X0	// round 2, key bytes 16-31
  1053  	AESENC	runtime·aeskeysched+32(SB), X0	// round 3, key bytes 32-47
  1054  	MOVL	X0, ret+8(FP)	// only the low 32 bits of X0 are returned
  1055  	RET
  1056  
  1057  TEXT runtime·aeshash64(SB),NOSPLIT,$0-12
        	// Hashes the 8 bytes at p with seed h and stores a 32-bit result.
        	// Arguments: p+0(FP) data pointer, h+4(FP) seed; result at ret+8(FP).
        	// X0 is built with the data in dword lanes 0-1 and the seed in lane 2,
        	// then run through three AESENC rounds keyed from runtime·aeskeysched.
  1058  	MOVL	p+0(FP), AX	// ptr to data
  1059  	MOVQ	(AX), X0	// data
  1060  	PINSRD	$2, h+4(FP), X0	// seed
  1061  	AESENC	runtime·aeskeysched+0(SB), X0	// round 1, key bytes 0-15
  1062  	AESENC	runtime·aeskeysched+16(SB), X0	// round 2, key bytes 16-31
  1063  	AESENC	runtime·aeskeysched+32(SB), X0	// round 3, key bytes 32-47
  1064  	MOVL	X0, ret+8(FP)	// only the low 32 bits of X0 are returned
  1065  	RET
  1066  
  1067  // simple mask to get rid of data in the high part of the register.
        //
        // masks<> is a read-only table of 16 entries, 16 bytes each (256 bytes).
        // Entry i starts at byte offset 16*i and has its low i bytes set to 0xff
        // and the remaining bytes zero, so ANDing a 16-byte register with entry i
        // keeps only its low i bytes. Entry 0 is all zeros.
        // checkASM (below) verifies that the table is 16-byte aligned.
  1068  DATA masks<>+0x00(SB)/4, $0x00000000
  1069  DATA masks<>+0x04(SB)/4, $0x00000000
  1070  DATA masks<>+0x08(SB)/4, $0x00000000
  1071  DATA masks<>+0x0c(SB)/4, $0x00000000
  1072  	
  1073  DATA masks<>+0x10(SB)/4, $0x000000ff
  1074  DATA masks<>+0x14(SB)/4, $0x00000000
  1075  DATA masks<>+0x18(SB)/4, $0x00000000
  1076  DATA masks<>+0x1c(SB)/4, $0x00000000
  1077  	
  1078  DATA masks<>+0x20(SB)/4, $0x0000ffff
  1079  DATA masks<>+0x24(SB)/4, $0x00000000
  1080  DATA masks<>+0x28(SB)/4, $0x00000000
  1081  DATA masks<>+0x2c(SB)/4, $0x00000000
  1082  	
  1083  DATA masks<>+0x30(SB)/4, $0x00ffffff
  1084  DATA masks<>+0x34(SB)/4, $0x00000000
  1085  DATA masks<>+0x38(SB)/4, $0x00000000
  1086  DATA masks<>+0x3c(SB)/4, $0x00000000
  1087  	
  1088  DATA masks<>+0x40(SB)/4, $0xffffffff
  1089  DATA masks<>+0x44(SB)/4, $0x00000000
  1090  DATA masks<>+0x48(SB)/4, $0x00000000
  1091  DATA masks<>+0x4c(SB)/4, $0x00000000
  1092  	
  1093  DATA masks<>+0x50(SB)/4, $0xffffffff
  1094  DATA masks<>+0x54(SB)/4, $0x000000ff
  1095  DATA masks<>+0x58(SB)/4, $0x00000000
  1096  DATA masks<>+0x5c(SB)/4, $0x00000000
  1097  	
  1098  DATA masks<>+0x60(SB)/4, $0xffffffff
  1099  DATA masks<>+0x64(SB)/4, $0x0000ffff
  1100  DATA masks<>+0x68(SB)/4, $0x00000000
  1101  DATA masks<>+0x6c(SB)/4, $0x00000000
  1102  	
  1103  DATA masks<>+0x70(SB)/4, $0xffffffff
  1104  DATA masks<>+0x74(SB)/4, $0x00ffffff
  1105  DATA masks<>+0x78(SB)/4, $0x00000000
  1106  DATA masks<>+0x7c(SB)/4, $0x00000000
  1107  	
  1108  DATA masks<>+0x80(SB)/4, $0xffffffff
  1109  DATA masks<>+0x84(SB)/4, $0xffffffff
  1110  DATA masks<>+0x88(SB)/4, $0x00000000
  1111  DATA masks<>+0x8c(SB)/4, $0x00000000
  1112  	
  1113  DATA masks<>+0x90(SB)/4, $0xffffffff
  1114  DATA masks<>+0x94(SB)/4, $0xffffffff
  1115  DATA masks<>+0x98(SB)/4, $0x000000ff
  1116  DATA masks<>+0x9c(SB)/4, $0x00000000
  1117  	
  1118  DATA masks<>+0xa0(SB)/4, $0xffffffff
  1119  DATA masks<>+0xa4(SB)/4, $0xffffffff
  1120  DATA masks<>+0xa8(SB)/4, $0x0000ffff
  1121  DATA masks<>+0xac(SB)/4, $0x00000000
  1122  	
  1123  DATA masks<>+0xb0(SB)/4, $0xffffffff
  1124  DATA masks<>+0xb4(SB)/4, $0xffffffff
  1125  DATA masks<>+0xb8(SB)/4, $0x00ffffff
  1126  DATA masks<>+0xbc(SB)/4, $0x00000000
  1127  	
  1128  DATA masks<>+0xc0(SB)/4, $0xffffffff
  1129  DATA masks<>+0xc4(SB)/4, $0xffffffff
  1130  DATA masks<>+0xc8(SB)/4, $0xffffffff
  1131  DATA masks<>+0xcc(SB)/4, $0x00000000
  1132  	
  1133  DATA masks<>+0xd0(SB)/4, $0xffffffff
  1134  DATA masks<>+0xd4(SB)/4, $0xffffffff
  1135  DATA masks<>+0xd8(SB)/4, $0xffffffff
  1136  DATA masks<>+0xdc(SB)/4, $0x000000ff
  1137  	
  1138  DATA masks<>+0xe0(SB)/4, $0xffffffff
  1139  DATA masks<>+0xe4(SB)/4, $0xffffffff
  1140  DATA masks<>+0xe8(SB)/4, $0xffffffff
  1141  DATA masks<>+0xec(SB)/4, $0x0000ffff
  1142  	
  1143  DATA masks<>+0xf0(SB)/4, $0xffffffff
  1144  DATA masks<>+0xf4(SB)/4, $0xffffffff
  1145  DATA masks<>+0xf8(SB)/4, $0xffffffff
  1146  DATA masks<>+0xfc(SB)/4, $0x00ffffff
  1147  
  1148  GLOBL masks<>(SB),RODATA,$256
  1149  
  1150  // these are arguments to pshufb. They move data down from
  1151  // the high bytes of the register to the low bytes of the register.
  1152  // index is how many bytes to move.
        //
        // shifts<> is a read-only table of 16 entries, 16 bytes each (256 bytes).
        // Entry i starts at byte offset 16*i. Its low i control bytes are the
        // source indices 16-i .. 15, and every remaining control byte is 0xff,
        // which has the high bit set and therefore makes PSHUFB write zero.
        // The net effect is "move the top i bytes to the bottom, zero the rest".
        // The endofpage path of the AES hash body indexes it as
        // shifts<>(SB)(BX*8) after doubling BX, i.e. at 16*length.
        // Entry 0 is all zeros (it would select source byte 0 everywhere);
        // presumably it is never used, since zero-length input takes the
        // separate aes0 path -- TODO confirm against the dispatch code.
        // checkASM (below) verifies that the table is 16-byte aligned.
  1153  DATA shifts<>+0x00(SB)/4, $0x00000000
  1154  DATA shifts<>+0x04(SB)/4, $0x00000000
  1155  DATA shifts<>+0x08(SB)/4, $0x00000000
  1156  DATA shifts<>+0x0c(SB)/4, $0x00000000
  1157  	
  1158  DATA shifts<>+0x10(SB)/4, $0xffffff0f
  1159  DATA shifts<>+0x14(SB)/4, $0xffffffff
  1160  DATA shifts<>+0x18(SB)/4, $0xffffffff
  1161  DATA shifts<>+0x1c(SB)/4, $0xffffffff
  1162  	
  1163  DATA shifts<>+0x20(SB)/4, $0xffff0f0e
  1164  DATA shifts<>+0x24(SB)/4, $0xffffffff
  1165  DATA shifts<>+0x28(SB)/4, $0xffffffff
  1166  DATA shifts<>+0x2c(SB)/4, $0xffffffff
  1167  	
  1168  DATA shifts<>+0x30(SB)/4, $0xff0f0e0d
  1169  DATA shifts<>+0x34(SB)/4, $0xffffffff
  1170  DATA shifts<>+0x38(SB)/4, $0xffffffff
  1171  DATA shifts<>+0x3c(SB)/4, $0xffffffff
  1172  	
  1173  DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c
  1174  DATA shifts<>+0x44(SB)/4, $0xffffffff
  1175  DATA shifts<>+0x48(SB)/4, $0xffffffff
  1176  DATA shifts<>+0x4c(SB)/4, $0xffffffff
  1177  	
  1178  DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b
  1179  DATA shifts<>+0x54(SB)/4, $0xffffff0f
  1180  DATA shifts<>+0x58(SB)/4, $0xffffffff
  1181  DATA shifts<>+0x5c(SB)/4, $0xffffffff
  1182  	
  1183  DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a
  1184  DATA shifts<>+0x64(SB)/4, $0xffff0f0e
  1185  DATA shifts<>+0x68(SB)/4, $0xffffffff
  1186  DATA shifts<>+0x6c(SB)/4, $0xffffffff
  1187  	
  1188  DATA shifts<>+0x70(SB)/4, $0x0c0b0a09
  1189  DATA shifts<>+0x74(SB)/4, $0xff0f0e0d
  1190  DATA shifts<>+0x78(SB)/4, $0xffffffff
  1191  DATA shifts<>+0x7c(SB)/4, $0xffffffff
  1192  	
  1193  DATA shifts<>+0x80(SB)/4, $0x0b0a0908
  1194  DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c
  1195  DATA shifts<>+0x88(SB)/4, $0xffffffff
  1196  DATA shifts<>+0x8c(SB)/4, $0xffffffff
  1197  	
  1198  DATA shifts<>+0x90(SB)/4, $0x0a090807
  1199  DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b
  1200  DATA shifts<>+0x98(SB)/4, $0xffffff0f
  1201  DATA shifts<>+0x9c(SB)/4, $0xffffffff
  1202  	
  1203  DATA shifts<>+0xa0(SB)/4, $0x09080706
  1204  DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a
  1205  DATA shifts<>+0xa8(SB)/4, $0xffff0f0e
  1206  DATA shifts<>+0xac(SB)/4, $0xffffffff
  1207  	
  1208  DATA shifts<>+0xb0(SB)/4, $0x08070605
  1209  DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09
  1210  DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d
  1211  DATA shifts<>+0xbc(SB)/4, $0xffffffff
  1212  	
  1213  DATA shifts<>+0xc0(SB)/4, $0x07060504
  1214  DATA shifts<>+0xc4(SB)/4, $0x0b0a0908
  1215  DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c
  1216  DATA shifts<>+0xcc(SB)/4, $0xffffffff
  1217  	
  1218  DATA shifts<>+0xd0(SB)/4, $0x06050403
  1219  DATA shifts<>+0xd4(SB)/4, $0x0a090807
  1220  DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b
  1221  DATA shifts<>+0xdc(SB)/4, $0xffffff0f
  1222  	
  1223  DATA shifts<>+0xe0(SB)/4, $0x05040302
  1224  DATA shifts<>+0xe4(SB)/4, $0x09080706
  1225  DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a
  1226  DATA shifts<>+0xec(SB)/4, $0xffff0f0e
  1227  	
  1228  DATA shifts<>+0xf0(SB)/4, $0x04030201
  1229  DATA shifts<>+0xf4(SB)/4, $0x08070605
  1230  DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09
  1231  DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d
  1232  
  1233  GLOBL shifts<>(SB),RODATA,$256
  1234  
  1235  TEXT ·checkASM(SB),NOSPLIT,$0-1
  1236  	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
        	// Stores 1 to the single-byte result if both tables are aligned,
        	// 0 otherwise. Clobbers AX and BX.
  1237  	MOVL	$masks<>(SB), AX
  1238  	MOVL	$shifts<>(SB), BX
  1239  	ORL	BX, AX		// merge both addresses so one test covers both
  1240  	TESTL	$15, AX		// ZF set iff the low 4 bits of both addresses are 0
  1241  	SETEQ	ret+0(FP)
  1242  	RET
  1243  
  1244  // memequal(p, q unsafe.Pointer, size uintptr) bool
        // Reports whether the size bytes at a and b are identical. Identical
        // pointers are answered on the spot; anything else is handed to
        // memeqbody, which writes the result byte itself and returns to our caller.
  1245  TEXT runtime·memequal(SB),NOSPLIT,$0-13
  1246  	MOVL	b+4(FP), DI
  1247  	MOVL	a+0(FP), SI
  1248  	CMPL	SI, DI
  1249  	JNE	compare			// distinct pointers: contents must be compared
  1250  	MOVB	$1, ret+12(FP)		// same pointer: trivially equal
  1251  	RET
  1252  compare:
  1253  	LEAL	ret+12(FP), AX		// AX = address of the result byte
  1254  	MOVL	size+8(FP), BX		// BX = byte count
  1255  	JMP	runtime·memeqbody(SB)	// tail call: a in SI, b in DI, count in BX
  1256  
  1257  // memequal_varlen(a, b unsafe.Pointer) bool
        // Same contract as memequal, except that the byte count is not an
        // argument: it is read from offset 4 of the closure addressed by DX.
  1258  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
  1259  	MOVL	b+4(FP), DI
  1260  	MOVL	a+0(FP), SI
  1261  	CMPL	SI, DI
  1262  	JNE	compare			// distinct pointers: contents must be compared
  1263  	MOVB	$1, ret+8(FP)		// same pointer: trivially equal
  1264  	RET
  1265  compare:
  1266  	LEAL	ret+8(FP), AX		// AX = address of the result byte
  1267  	MOVL	4(DX), BX		// compiler stores size at offset 4 in the closure
  1268  	JMP	runtime·memeqbody(SB)	// tail call: a in SI, b in DI, count in BX
  1269  
  1270  // eqstring reports whether two strings hold the same bytes.
  1271  // The compiler only calls it with strings of equal length,
  1272  // which is why s1's length alone is consulted.
  1273  // See runtime_test.go:eqstring_generic for
  1274  // equivalent Go code.
  1275  TEXT runtime·eqstring(SB),NOSPLIT,$0-17
  1276  	MOVL	s2_base+8(FP), DI
  1277  	MOVL	s1_base+0(FP), SI
  1278  	CMPL	SI, DI
  1279  	JNE	compare			// different backing arrays: compare contents
  1280  	MOVB	$1, ret+16(FP)		// same base pointer (and equal length): equal
  1281  	RET
  1282  compare:
  1283  	LEAL	ret+16(FP), AX		// AX = address of the result byte
  1284  	MOVL	s1_len+4(FP), BX	// BX = byte count
  1285  	JMP	runtime·memeqbody(SB)	// tail call: a in SI, b in DI, count in BX
  1286  
  1287  TEXT bytes·Equal(SB),NOSPLIT,$0-25
        	// Reports whether byte slices a and b have the same length and contents.
        	// Slices of different length are unequal without looking at the data;
        	// otherwise memeqbody compares the bytes and writes the result.
  1288  	MOVL	a_len+4(FP), BX
  1289  	MOVL	b_len+16(FP), CX
  1290  	CMPL	BX, CX
  1291  	JEQ	samelen
  1292  	MOVB	$0, ret+24(FP)		// lengths differ: not equal
  1293  	RET
  1294  samelen:
  1295  	LEAL	ret+24(FP), AX		// AX = address of the result byte
  1296  	MOVL	b+12(FP), DI
  1297  	MOVL	a+0(FP), SI
  1298  	JMP	runtime·memeqbody(SB)	// tail call: a in SI, b in DI, count in BX
  1299  
  1300  // a in SI
  1301  // b in DI
  1302  // count in BX
  1303  // address of result byte in AX
        //
        // memeqbody stores 1 to (AX) if the BX bytes at SI and DI are equal and
        // 0 otherwise. The wrappers in this file reach it with JMP, so its RET
        // returns directly to the wrapper's caller.
        // Clobbers BX, CX, DX, SI, DI and (on the SSE2 path) X0-X7.
  1304  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1305  	CMPL	BX, $4
  1306  	JB	small	// fewer than 4 bytes: one masked 32-bit compare
  1307  
  1308  	// 64 bytes at a time using xmm registers
  1309  hugeloop:
  1310  	CMPL	BX, $64
  1311  	JB	bigloop
  1312  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
  1313  	JE	bigloop
  1314  	MOVOU	(SI), X0
  1315  	MOVOU	(DI), X1
  1316  	MOVOU	16(SI), X2
  1317  	MOVOU	16(DI), X3
  1318  	MOVOU	32(SI), X4
  1319  	MOVOU	32(DI), X5
  1320  	MOVOU	48(SI), X6
  1321  	MOVOU	48(DI), X7
  1322  	PCMPEQB	X1, X0	// each byte becomes 0xff where a and b agree
  1323  	PCMPEQB	X3, X2
  1324  	PCMPEQB	X5, X4
  1325  	PCMPEQB	X7, X6
  1326  	PAND	X2, X0	// fold the four equality masks into X0
  1327  	PAND	X6, X4
  1328  	PAND	X4, X0
  1329  	PMOVMSKB X0, DX	// DX = 16-bit mask, one bit per byte of X0
  1330  	ADDL	$64, SI	// advance before testing; the CMPL below resets flags
  1331  	ADDL	$64, DI
  1332  	SUBL	$64, BX
  1333  	CMPL	DX, $0xffff	// all 16 bits set <=> all 64 bytes matched
  1334  	JEQ	hugeloop
  1335  	MOVB	$0, (AX)
  1336  	RET
  1337  
  1338  	// 4 bytes at a time using 32-bit register
  1339  bigloop:
  1340  	CMPL	BX, $4
  1341  	JBE	leftover
  1342  	MOVL	(SI), CX
  1343  	MOVL	(DI), DX
  1344  	ADDL	$4, SI
  1345  	ADDL	$4, DI
  1346  	SUBL	$4, BX
  1347  	CMPL	CX, DX
  1348  	JEQ	bigloop
  1349  	MOVB	$0, (AX)
  1350  	RET
  1351  
  1352  	// remaining 0-4 bytes
        	// Compares the 4 bytes that end at the end of each buffer. This may
        	// re-read bytes already compared; that is safe because this label is
        	// only reached when the original count was at least 4.
  1353  leftover:
  1354  	MOVL	-4(SI)(BX*1), CX
  1355  	MOVL	-4(DI)(BX*1), DX
  1356  	CMPL	CX, DX
  1357  	SETEQ	(AX)
  1358  	RET
  1359  
        	// 0-3 bytes in total. Each side is loaded as one 32-bit word without
        	// reading across the boundary guarded below, then the unwanted high
        	// bytes are shifted out before testing for equality.
  1360  small:
  1361  	CMPL	BX, $0
  1362  	JEQ	equal	// zero bytes: ZF is set, so SETEQ below stores 1
  1363  
  1364  	LEAL	0(BX*8), CX
  1365  	NEGL	CX	// CX = -8*count; as a shift count (mod 32) this is 32-8*count
  1366  
  1367  	MOVL	SI, DX
  1368  	CMPB	DX, $0xfc	// low address byte above 0xfc: a 4-byte load could cross
  1369  	JA	si_high
  1370  
  1371  	// load at SI won't cross a page boundary.
  1372  	MOVL	(SI), SI
  1373  	JMP	si_finish
  1374  si_high:
  1375  	// address ends in 111111xx. Load up to bytes we want, move to correct position.
  1376  	MOVL	-4(SI)(BX*1), SI
  1377  	SHRL	CX, SI	// wanted bytes were at the top; bring them to the bottom
  1378  si_finish:
  1379  
  1380  	// same for DI.
  1381  	MOVL	DI, DX
  1382  	CMPB	DX, $0xfc
  1383  	JA	di_high
  1384  	MOVL	(DI), DI
  1385  	JMP	di_finish
  1386  di_high:
  1387  	MOVL	-4(DI)(BX*1), DI
  1388  	SHRL	CX, DI
  1389  di_finish:
  1390  
  1391  	SUBL	SI, DI	// difference of the two words (high bytes may be garbage)
  1392  	SHLL	CX, DI	// discard the garbage high bytes; ZF set iff the low count bytes match
  1393  equal:
  1394  	SETEQ	(AX)
  1395  	RET
  1396  
  1397  TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
        	// Three-way comparison of strings s1 and s2. This wrapper only loads
        	// the registers cmpbody expects and tail-calls it; cmpbody writes the
        	// result word through AX.
  1398  	LEAL	ret+16(FP), AX		// AX = address of the result word
  1399  	MOVL	s2_len+12(FP), DX	// DX = length of s2
  1400  	MOVL	s2_base+8(FP), DI	// DI = bytes of s2
  1401  	MOVL	s1_len+4(FP), BX	// BX = length of s1
  1402  	MOVL	s1_base+0(FP), SI	// SI = bytes of s1
  1403  	JMP	runtime·cmpbody(SB)
  1404  
  1405  TEXT bytes·Compare(SB),NOSPLIT,$0-28
        	// Three-way comparison of byte slices s1 and s2. This wrapper only
        	// loads the registers cmpbody expects and tail-calls it; cmpbody
        	// writes the result word through AX.
  1406  	LEAL	ret+24(FP), AX		// AX = address of the result word
  1407  	MOVL	s2+16(FP), DX		// DX = length of s2
  1408  	MOVL	s2+12(FP), DI		// DI = data pointer of s2
  1409  	MOVL	s1+4(FP), BX		// BX = length of s1
  1410  	MOVL	s1+0(FP), SI		// SI = data pointer of s1
  1411  	JMP	runtime·cmpbody(SB)
  1412  
  1413  TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
        	// Stores the index of the first byte equal to c in the slice s,
        	// or -1 if there is none. Clobbers AX, CX, SI, DI.
  1414  	MOVL	s+0(FP), SI
  1415  	MOVL	s_len+4(FP), CX
  1416  	MOVB	c+12(FP), AL
  1417  	MOVL	SI, DI	// DI is the scan cursor; SI keeps the start for the subtraction
  1418  	CLD; REPN; SCASB	// scan forward while CX != 0 and byte at DI != AL
        	// Note: when the length is 0 no compare executes and ZF is whatever
        	// it was on entry. Both outcomes still yield -1, because the
        	// "found" path then computes DI-SI-1 with DI == SI.
  1419  	JZ 3(PC)	// last compare matched: skip the two "not found" instructions
  1420  	MOVL	$-1, ret+16(FP)
  1421  	RET
  1422  	SUBL	SI, DI	// DI points one past the match
  1423  	SUBL	$1, DI	// so index = DI - SI - 1
  1424  	MOVL	DI, ret+16(FP)
  1425  	RET
  1426  
  1427  TEXT strings·IndexByte(SB),NOSPLIT,$0-16
        	// Stores the index of the first byte equal to c in the string s,
        	// or -1 if there is none. Clobbers AX, CX, SI, DI.
  1428  	MOVL	s+0(FP), SI
  1429  	MOVL	s_len+4(FP), CX
  1430  	MOVB	c+8(FP), AL
  1431  	MOVL	SI, DI	// DI is the scan cursor; SI keeps the start for the subtraction
  1432  	CLD; REPN; SCASB	// scan forward while CX != 0 and byte at DI != AL
        	// Note: when the length is 0 no compare executes and ZF is whatever
        	// it was on entry. Both outcomes still yield -1, because the
        	// "found" path then computes DI-SI-1 with DI == SI.
  1433  	JZ 3(PC)	// last compare matched: skip the two "not found" instructions
  1434  	MOVL	$-1, ret+12(FP)
  1435  	RET
  1436  	SUBL	SI, DI	// DI points one past the match
  1437  	SUBL	$1, DI	// so index = DI - SI - 1
  1438  	MOVL	DI, ret+12(FP)
  1439  	RET
  1440  
  1441  // input:
  1442  //   SI = a
  1443  //   DI = b
  1444  //   BX = alen
  1445  //   DX = blen
  1446  //   AX = address of return word (set to 1/0/-1)
        //
        // cmpbody compares the first min(alen, blen) bytes as unsigned bytes and
        // stores +1 if a sorts after b, -1 if before. If the common prefix is
        // identical the result is decided by the lengths (see allsame).
        // The wrappers in this file reach it with JMP, so its RET returns to the
        // wrapper's caller. Clobbers BX, CX, DX, BP, SI, DI and possibly X0-X1.
  1447  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1448  	MOVL	DX, BP
  1449  	SUBL	BX, DX // DX = blen-alen
  1450  	JLE	2(PC)	// blen <= alen: BP already holds blen, skip the next MOVL
  1451  	MOVL	BX, BP // BP = min(alen, blen)
  1452  	CMPL	SI, DI
  1453  	JEQ	allsame	// same pointer: only the lengths can differ
  1454  	CMPL	BP, $4
  1455  	JB	small
  1456  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
  1457  	JE	mediumloop
  1458  largeloop:
  1459  	CMPL	BP, $16
  1460  	JB	mediumloop
  1461  	MOVOU	(SI), X0
  1462  	MOVOU	(DI), X1
  1463  	PCMPEQB X0, X1
  1464  	PMOVMSKB X1, BX
  1465  	XORL	$0xffff, BX	// convert EQ to NE
  1466  	JNE	diff16	// branch if at least one byte is not equal
  1467  	ADDL	$16, SI
  1468  	ADDL	$16, DI
  1469  	SUBL	$16, BP
  1470  	JMP	largeloop
  1471  
        	// BX holds one set bit per differing byte in the 16-byte block.
  1472  diff16:
  1473  	BSFL	BX, BX	// index of first byte that differs
  1474  	XORL	DX, DX
  1475  	MOVB	(SI)(BX*1), CX
  1476  	CMPB	CX, (DI)(BX*1)
  1477  	SETHI	DX	// DX = 1 if a's byte is larger (unsigned), else 0
  1478  	LEAL	-1(DX*2), DX	// convert 1/0 to +1/-1
  1479  	MOVL	DX, (AX)
  1480  	RET
  1481  
  1482  mediumloop:
  1483  	CMPL	BP, $4
  1484  	JBE	_0through4
  1485  	MOVL	(SI), BX
  1486  	MOVL	(DI), CX
  1487  	CMPL	BX, CX
  1488  	JNE	diff4
  1489  	ADDL	$4, SI
  1490  	ADDL	$4, DI
  1491  	SUBL	$4, BP
  1492  	JMP	mediumloop
  1493  
        	// Compare the 4 bytes that end at the end of the common prefix. They
        	// may overlap bytes already compared; this label is only reached when
        	// the common length was at least 4, so the load stays inside the data.
  1494  _0through4:
  1495  	MOVL	-4(SI)(BP*1), BX
  1496  	MOVL	-4(DI)(BP*1), CX
  1497  	CMPL	BX, CX
  1498  	JEQ	allsame
  1499  
        	// BX and CX hold differing little-endian words from a and b.
  1500  diff4:
  1501  	BSWAPL	BX	// reverse order of bytes
  1502  	BSWAPL	CX
  1503  	XORL	BX, CX	// find bit differences
  1504  	BSRL	CX, CX	// index of highest bit difference
  1505  	SHRL	CX, BX	// move a's bit to bottom
  1506  	ANDL	$1, BX	// mask bit
  1507  	LEAL	-1(BX*2), BX // 1/0 => +1/-1
  1508  	MOVL	BX, (AX)
  1509  	RET
  1510  
  1511  	// 0-3 bytes in common
  1512  small:
  1513  	LEAL	(BP*8), CX
  1514  	NEGL	CX	// CX = -8*BP; as a shift count (mod 32) this is 32-8*BP
  1515  	JEQ	allsame	// NEGL left ZF set, i.e. BP == 0: nothing in common to compare
  1516  
  1517  	// load si
  1518  	CMPB	SI, $0xfc	// low address byte above 0xfc: avoid a forward 4-byte load
  1519  	JA	si_high
  1520  	MOVL	(SI), SI
  1521  	JMP	si_finish
  1522  si_high:
  1523  	MOVL	-4(SI)(BP*1), SI	// load the 4 bytes ending at the last wanted byte
  1524  	SHRL	CX, SI	// bring the wanted bytes down to the bottom
  1525  si_finish:
  1526  	SHLL	CX, SI	// push the wanted bytes to the top, dropping the unwanted ones
  1527  
  1528  	// same for di
  1529  	CMPB	DI, $0xfc
  1530  	JA	di_high
  1531  	MOVL	(DI), DI
  1532  	JMP	di_finish
  1533  di_high:
  1534  	MOVL	-4(DI)(BP*1), DI
  1535  	SHRL	CX, DI
  1536  di_finish:
  1537  	SHLL	CX, DI
  1538  
  1539  	BSWAPL	SI	// reverse order of bytes
  1540  	BSWAPL	DI
  1541  	XORL	SI, DI	// find bit differences
  1542  	JEQ	allsame
  1543  	BSRL	DI, CX	// index of highest bit difference
  1544  	SHRL	CX, SI	// move a's bit to bottom
  1545  	ANDL	$1, SI	// mask bit
  1546  	LEAL	-1(SI*2), BX // 1/0 => +1/-1
  1547  	MOVL	BX, (AX)
  1548  	RET
  1549  
  1550  	// all the bytes in common are the same, so we just need
  1551  	// to compare the lengths.
        	// DX still holds blen-alen from the top of the function.
  1552  allsame:
  1553  	XORL	BX, BX
  1554  	XORL	CX, CX
  1555  	TESTL	DX, DX
  1556  	SETLT	BX	// 1 if alen > blen
  1557  	SETEQ	CX	// 1 if alen == blen
  1558  	LEAL	-1(CX)(BX*2), BX	// 1,0,-1 result
  1559  	MOVL	BX, (AX)
  1560  	RET
  1561  
  1562  TEXT runtime·return0(SB), NOSPLIT, $0
        	// Returns with AX set to 0. Touches no other register and no memory.
  1563  	MOVL	$0, AX
  1564  	RET
  1565  
  1566  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  1567  // Must obey the gcc calling convention.
        // The result is returned in AX. Only AX and CX are written.
  1568  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  1569  	get_tls(CX)	// macro (presumably from go_tls.h): CX = thread-local storage base
  1570  	MOVL	g(CX), AX	// AX = current g
  1571  	MOVL	g_m(AX), AX	// AX = g.m
  1572  	MOVL	m_curg(AX), AX	// AX = m.curg
  1573  	MOVL	(g_stack+stack_hi)(AX), AX	// AX = curg.stack.hi
  1574  	RET
  1575  
  1576  // The top-most function running on a goroutine
  1577  // returns to goexit+PCQuantum.
        // The leading NOP therefore sits before that return address, so the
        // address falls inside goexit rather than at its very first byte
        // (presumably PCQuantum is 1 on 386 -- TODO confirm).
  1578  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  1579  	BYTE	$0x90	// NOP
  1580  	CALL	runtime·goexit1(SB)	// does not return
  1581  	// traceback from goexit1 must hit code range of goexit
        	// The trailing NOP keeps the CALL's return address inside this function.
  1582  	BYTE	$0x90	// NOP
  1583  
  1584  // Prefetching doesn't seem to help.
        // prefetcht0 is therefore a no-op here: it declares one 4-byte argument
        // and returns immediately without reading it.
  1585  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4
  1586  	RET
  1587  
  1588  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4
  1589  	RET	// no-op: the 4-byte argument is never read
  1590  
  1591  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4
  1592  	RET	// no-op: the 4-byte argument is never read
  1593  
  1594  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4
  1595  	RET	// no-op: the 4-byte argument is never read
  1596  
  1597  // Add a module's moduledata to the linked list of moduledata objects. This
  1598  // is called from .init_array by a function generated in the linker and so
  1599  // follows the platform ABI wrt register preservation -- it only touches AX,
  1600  // CX (implicitly) and DX, but it does not follow the ABI wrt arguments:
  1601  // instead the pointer to the moduledata is passed in AX.
        // The new entry is appended after the current tail (lastmoduledatap) and
        // then becomes the tail itself.
  1602  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  1603         MOVL    runtime·lastmoduledatap(SB), DX	// DX = current tail of the list
  1604         MOVL    AX, moduledata_next(DX)	// tail.next = new moduledata
  1605         MOVL    AX, runtime·lastmoduledatap(SB)	// new moduledata is now the tail
  1606         RET
  1607  
  1608  TEXT runtime·uint32tofloat64(SB),NOSPLIT,$8-12
        	// Converts the uint32 argument a to float64 using the x87 unit.
        	// The value is widened to a 64-bit integer with a zero high word in the
        	// 8-byte frame, so the 64-bit integer load sees it as non-negative.
  1609  	MOVL	a+0(FP), AX
  1610  	MOVL	AX, 0(SP)	// low word = a
  1611  	MOVL	$0, 4(SP)	// high word = 0
  1612  	FMOVV	0(SP), F0	// load the 64-bit integer onto the x87 stack
  1613  	FMOVDP	F0, ret+4(FP)	// store as float64 and pop
  1614  	RET
  1615  
  1616  TEXT runtime·float64touint32(SB),NOSPLIT,$12-12
        	// Converts the float64 argument a to uint32 using the x87 unit.
        	// Frame use: 0(SP) holds the saved x87 control word, 4(SP)-11(SP) the
        	// 64-bit integer result, of which only the low 32 bits are returned.
  1617  	FMOVD	a+0(FP), F0
  1618  	FSTCW	0(SP)	// save the caller's x87 control word
  1619  	FLDCW	runtime·controlWord64trunc(SB)	// presumably selects truncating conversion -- confirm at its definition
  1620  	FMOVVP	F0, 4(SP)	// store as 64-bit integer and pop
  1621  	FLDCW	0(SP)	// restore the saved control word
  1622  	MOVL	4(SP), AX	// low 32 bits of the integer
  1623  	MOVL	AX, ret+8(FP)
  1624  	RET