github.com/slayercat/go@v0.0.0-20170428012452-c51559813f61/src/runtime/asm_386.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0	// process bootstrap: probe the CPU, set up g0/m0 and TLS, initialize the runtime, start the main goroutine
    11  	// copy arguments forward on an even stack
    12  	MOVL	argc+0(FP), AX
    13  	MOVL	argv+4(FP), BX
    14  	SUBL	$128, SP		// plenty of scratch
    15  	ANDL	$~15, SP		// round SP down to a 16-byte boundary
    16  	MOVL	AX, 120(SP)		// save argc, argv away
    17  	MOVL	BX, 124(SP)
    18  
    19  	// set default stack bounds.
    20  	// _cgo_init may update stackguard.
    21  	MOVL	$runtime·g0(SB), BP	// BP = &g0; kept live until the _cgo_init call below
    22  	LEAL	(-64*1024+104)(SP), BX	// BX = SP - 64*1024 + 104, the provisional low bound
    23  	MOVL	BX, g_stackguard0(BP)
    24  	MOVL	BX, g_stackguard1(BP)
    25  	MOVL	BX, (g_stack+stack_lo)(BP)
    26  	MOVL	SP, (g_stack+stack_hi)(BP)
    27  	
    28  	// find out information about the processor we're on
    29  #ifdef GOOS_nacl // NaCl doesn't like PUSHFL/POPFL
    30  	JMP 	has_cpuid
    31  #else
    32  	// first see if CPUID instruction is supported.
    33  	PUSHFL			// copy of the original EFLAGS, restored by the final POPFL
    34  	PUSHFL
    35  	XORL	$(1<<21), 0(SP) // flip ID bit
    36  	POPFL
    37  	PUSHFL
    38  	POPL	AX		// AX = EFLAGS after the attempted flip
    39  	XORL	0(SP), AX	// AX = bits that differ from the original EFLAGS
    40  	POPFL	// restore EFLAGS
    41  	TESTL	$(1<<21), AX
    42  	JNE 	has_cpuid	// the ID bit could be toggled, so CPUID is available
    43  #endif
    44  
    45  bad_proc: // show that the program requires MMX.
    46  	MOVL	$2, 0(SP)	// write(2, bad_proc_msg, 0x3d); 2 is presumably the stderr fd
    47  	MOVL	$bad_proc_msg<>(SB), 4(SP)
    48  	MOVL	$0x3d, 8(SP)
    49  	CALL	runtime·write(SB)
    50  	MOVL	$1, 0(SP)	// exit(1)
    51  	CALL	runtime·exit(SB)
    52  	INT	$3		// trap if exit returns
    53  
    54  has_cpuid:
    55  	MOVL	$0, AX
    56  	CPUID
    57  	MOVL	AX, SI		// SI = EAX of leaf 0; compared against 7 below before leaf 7 is queried
    58  	CMPL	AX, $0
    59  	JE	nocpuinfo
    60  
    61  	// Figure out how to serialize RDTSC.
    62  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    63  	// Don't know about the rest, so let's do MFENCE.
    64  	CMPL	BX, $0x756E6547  // "Genu"
    65  	JNE	notintel
    66  	CMPL	DX, $0x49656E69  // "ineI"
    67  	JNE	notintel
    68  	CMPL	CX, $0x6C65746E  // "ntel"
    69  	JNE	notintel
    70  	MOVB	$1, runtime·isIntel(SB)
    71  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
    72  notintel:
    73  
    74  	// Load EAX=1 cpuid flags
    75  	MOVL	$1, AX
    76  	CPUID
    77  	MOVL	CX, DI // Move to global variable clobbers CX when generating PIC
    78  	MOVL	AX, runtime·cpuid_eax(SB)
    79  	MOVL	DI, runtime·cpuid_ecx(SB)
    80  	MOVL	DX, runtime·cpuid_edx(SB)
    81  
    82  	// Check for MMX support
    83  	TESTL	$(1<<23), DX	// MMX
    84  	JZ 	bad_proc	// no MMX: print the message and exit
    85  
    86  	// Load EAX=7/ECX=0 cpuid flags
    87  	CMPL	SI, $7
    88  	JLT	nocpuinfo	// leaf 7 not reported as available; leave cpuid_ebx7 untouched
    89  	MOVL	$7, AX
    90  	MOVL	$0, CX
    91  	CPUID
    92  	MOVL	BX, runtime·cpuid_ebx7(SB)
    93  
    94  nocpuinfo:	
    95  
    96  	// if there is an _cgo_init, call it to let it
    97  	// initialize and to set up GS.  if not,
    98  	// we set up GS ourselves.
    99  	MOVL	_cgo_init(SB), AX
   100  	TESTL	AX, AX
   101  	JZ	needtls
   102  	MOVL	$setg_gcc<>(SB), BX
   103  	MOVL	BX, 4(SP)	// second argument: address of setg_gcc
   104  	MOVL	BP, 0(SP)	// first argument: &g0 (BP was loaded above)
   105  	CALL	AX
   106  
   107  	// update stackguard after _cgo_init
   108  	MOVL	$runtime·g0(SB), CX
   109  	MOVL	(g_stack+stack_lo)(CX), AX
   110  	ADDL	$const__StackGuard, AX	// both guards = stack.lo + _StackGuard
   111  	MOVL	AX, g_stackguard0(CX)
   112  	MOVL	AX, g_stackguard1(CX)
   113  
   114  #ifndef GOOS_windows
   115  	// skip runtime·ldt0setup(SB) and tls test after _cgo_init for non-windows
   116  	JMP ok
   117  #endif
   118  needtls:
   119  #ifdef GOOS_plan9
   120  	// skip runtime·ldt0setup(SB) and tls test on Plan 9 in all cases
   121  	JMP	ok
   122  #endif
   123  
   124  	// set up %gs
   125  	CALL	runtime·ldt0setup(SB)
   126  
   127  	// store through it, to make sure it works
   128  	get_tls(BX)
   129  	MOVL	$0x123, g(BX)	// write a marker through the TLS g slot ...
   130  	MOVL	runtime·m0+m_tls(SB), AX	// ... and read it back directly from m0.tls
   131  	CMPL	AX, $0x123
   132  	JEQ	ok
   133  	MOVL	AX, 0	// abort
   134  ok:
   135  	// set up m and g "registers"
   136  	get_tls(BX)
   137  	LEAL	runtime·g0(SB), DX
   138  	MOVL	DX, g(BX)	// current g = g0
   139  	LEAL	runtime·m0(SB), AX
   140  
   141  	// save m->g0 = g0
   142  	MOVL	DX, m_g0(AX)
   143  	// save g0->m = m0
   144  	MOVL	AX, g_m(DX)
   145  
   146  	CALL	runtime·emptyfunc(SB)	// fault if stack check is wrong
   147  
   148  	// convention is D is always cleared
   149  	CLD
   150  
   151  	CALL	runtime·check(SB)
   152  
   153  	// saved argc, argv
   154  	MOVL	120(SP), AX	// reload the values stashed at the top of this function
   155  	MOVL	AX, 0(SP)
   156  	MOVL	124(SP), AX
   157  	MOVL	AX, 4(SP)
   158  	CALL	runtime·args(SB)
   159  	CALL	runtime·osinit(SB)
   160  	CALL	runtime·schedinit(SB)
   161  
   162  	// create a new goroutine to start program
   163  	PUSHL	$runtime·mainPC(SB)	// entry
   164  	PUSHL	$0	// arg size
   165  	CALL	runtime·newproc(SB)
   166  	POPL	AX	// discard the two pushed arguments
   167  	POPL	AX
   168  
   169  	// start this M
   170  	CALL	runtime·mstart(SB)
   171  
   172  	INT $3	// trap if mstart returns
   173  	RET
   174  
   175  DATA	bad_proc_msg<>+0x00(SB)/8, $"This pro"	// message written by rt0_go's bad_proc path, laid out in 8-byte chunks
   176  DATA	bad_proc_msg<>+0x08(SB)/8, $"gram can"
   177  DATA	bad_proc_msg<>+0x10(SB)/8, $" only be"
   178  DATA	bad_proc_msg<>+0x18(SB)/8, $" run on "
   179  DATA	bad_proc_msg<>+0x20(SB)/8, $"processo"
   180  DATA	bad_proc_msg<>+0x28(SB)/8, $"rs with "
   181  DATA	bad_proc_msg<>+0x30(SB)/8, $"MMX supp"
   182  DATA	bad_proc_msg<>+0x38(SB)/4, $"ort."
   183  DATA	bad_proc_msg<>+0x3c(SB)/1, $0xa	// trailing newline
   184  GLOBL	bad_proc_msg<>(SB), RODATA, $0x3d	// 0x3d bytes total; the same length is passed to write in bad_proc
   185  
   186  DATA	runtime·mainPC+0(SB)/4,$runtime·main(SB)	// 4-byte cell holding the address of runtime·main
   187  GLOBL	runtime·mainPC(SB),RODATA,$4	// rt0_go pushes &mainPC as the entry argument to newproc
   188  
   189  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0	// raise a breakpoint trap, then return to the caller
   190  	INT $3
   191  	RET
   192  
   193  TEXT runtime·asminit(SB),NOSPLIT,$0-0	// load the x87 FPU control word from runtime·controlWord64
   194  	// Linux and MinGW start the FPU in extended double precision.
   195  	// Other operating systems use double precision.
   196  	// Change to double precision to match them,
   197  	// and to match other hardware that only has double.
   198  	FLDCW	runtime·controlWord64(SB)
   199  	RET
   200  
   201  /*
   202   *  go-routine
   203   */
   204  
   205  // void gosave(Gobuf*)
   206  // save state in Gobuf; setjmp. Records the caller's SP and return PC and the current g, zeroes ret, and calls badctxt if ctxt is non-zero.
   207  TEXT runtime·gosave(SB), NOSPLIT, $0-4
   208  	MOVL	buf+0(FP), AX		// gobuf
   209  	LEAL	buf+0(FP), BX		// caller's SP
   210  	MOVL	BX, gobuf_sp(AX)
   211  	MOVL	0(SP), BX		// caller's PC
   212  	MOVL	BX, gobuf_pc(AX)
   213  	MOVL	$0, gobuf_ret(AX)
   214  	// Assert ctxt is zero. See func save.
   215  	MOVL	gobuf_ctxt(AX), BX
   216  	TESTL	BX, BX
   217  	JZ	2(PC)			// ctxt == 0: skip the badctxt call
   218  	CALL	runtime·badctxt(SB)
   219  	get_tls(CX)
   220  	MOVL	g(CX), BX		// BX = current g
   221  	MOVL	BX, gobuf_g(AX)
   222  	RET
   223  
   224  // void gogo(Gobuf*)
   225  // restore state from Gobuf; longjmp. Does not return to its caller: it installs buf.g, switches SP and jumps to buf.pc.
   226  TEXT runtime·gogo(SB), NOSPLIT, $8-4
   227  	MOVL	buf+0(FP), BX		// gobuf
   228  
   229  	// If ctxt is not nil, invoke deletion barrier before overwriting.
   230  	MOVL	gobuf_ctxt(BX), DX
   231  	TESTL	DX, DX
   232  	JZ	nilctxt
   233  	LEAL	gobuf_ctxt(BX), AX
   234  	MOVL	AX, 0(SP)		// first argument: &buf.ctxt
   235  	MOVL	$0, 4(SP)		// second argument: 0, the value stored into buf.ctxt below
   236  	CALL	runtime·writebarrierptr_prewrite(SB)
   237  	MOVL	buf+0(FP), BX		// reload gobuf pointer after the call
   238  
   239  nilctxt:
   240  	MOVL	gobuf_g(BX), DX
   241  	MOVL	0(DX), CX		// make sure g != nil
   242  	get_tls(CX)
   243  	MOVL	DX, g(CX)		// current g = buf.g
   244  	MOVL	gobuf_sp(BX), SP	// restore SP
   245  	MOVL	gobuf_ret(BX), AX	// AX = saved return value
   246  	MOVL	gobuf_ctxt(BX), DX	// DX = saved context
   247  	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
   248  	MOVL	$0, gobuf_ret(BX)
   249  	MOVL	$0, gobuf_ctxt(BX)
   250  	MOVL	gobuf_pc(BX), BX
   251  	JMP	BX			// resume at the saved PC
   252  
   253  // func mcall(fn func(*g))
   254  // Switch to m->g0's stack, call fn(g).
   255  // Fn must never return. It should gogo(&g->sched)
   256  // to keep running g.
   257  TEXT runtime·mcall(SB), NOSPLIT, $0-4
   258  	MOVL	fn+0(FP), DI	// DI = fn (closure pointer)
   259  
   260  	get_tls(DX)
   261  	MOVL	g(DX), AX	// save state in g->sched
   262  	MOVL	0(SP), BX	// caller's PC
   263  	MOVL	BX, (g_sched+gobuf_pc)(AX)
   264  	LEAL	fn+0(FP), BX	// caller's SP
   265  	MOVL	BX, (g_sched+gobuf_sp)(AX)
   266  	MOVL	AX, (g_sched+gobuf_g)(AX)
   267  
   268  	// switch to m->g0 & its stack, call fn
   269  	MOVL	g(DX), BX
   270  	MOVL	g_m(BX), BX
   271  	MOVL	m_g0(BX), SI
   272  	CMPL	SI, AX	// if g == m->g0 call badmcall
   273  	JNE	3(PC)	// g != g0: skip the two-instruction jump to badmcall
   274  	MOVL	$runtime·badmcall(SB), AX
   275  	JMP	AX
   276  	MOVL	SI, g(DX)	// g = m->g0
   277  	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   278  	PUSHL	AX	// argument to fn: the g we switched away from
   279  	MOVL	DI, DX	// DX = closure pointer for the callee
   280  	MOVL	0(DI), DI	// DI = code pointer stored in the first word of the closure
   281  	CALL	DI
   282  	POPL	AX
   283  	MOVL	$runtime·badmcall2(SB), AX	// reached only if fn returned
   284  	JMP	AX
   285  	RET
   286  
   287  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   288  // of the G stack. We need to distinguish the routine that
   289  // lives at the bottom of the G stack from the one that lives
   290  // at the top of the system stack because the one at the top of
   291  // the system stack terminates the stack walk (see topofstack()).
   292  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0	// only its address is used: systemstack stores it as g->sched.pc
   293  	RET
   294  
   295  // func systemstack(fn func())
   296  TEXT runtime·systemstack(SB), NOSPLIT, $0-4	// run fn on the g0 stack; if already on g0 or gsignal, call fn directly on the current stack
   297  	MOVL	fn+0(FP), DI	// DI = fn
   298  	get_tls(CX)
   299  	MOVL	g(CX), AX	// AX = g
   300  	MOVL	g_m(AX), BX	// BX = m
   301  
   302  	MOVL	m_gsignal(BX), DX	// DX = gsignal
   303  	CMPL	AX, DX
   304  	JEQ	noswitch
   305  
   306  	MOVL	m_g0(BX), DX	// DX = g0
   307  	CMPL	AX, DX
   308  	JEQ	noswitch
   309  
   310  	MOVL	m_curg(BX), BP
   311  	CMPL	AX, BP
   312  	JEQ	switch
   313  	
   314  	// Bad: g is not gsignal, not g0, not curg. What is it?
   315  	// Hide call from linker nosplit analysis.
   316  	MOVL	$runtime·badsystemstack(SB), AX
   317  	CALL	AX; INT	$3	// trap if badsystemstack returns, instead of falling through into switch (same pattern as morestack)
   318  
   319  switch:
   320  	// save our state in g->sched. Pretend to
   321  	// be systemstack_switch if the G stack is scanned.
   322  	MOVL	$runtime·systemstack_switch(SB), (g_sched+gobuf_pc)(AX)
   323  	MOVL	SP, (g_sched+gobuf_sp)(AX)
   324  	MOVL	AX, (g_sched+gobuf_g)(AX)
   325  
   326  	// switch to g0
   327  	get_tls(CX)
   328  	MOVL	DX, g(CX)	// DX still holds g0 from the comparison above
   329  	MOVL	(g_sched+gobuf_sp)(DX), BX
   330  	// make it look like mstart called systemstack on g0, to stop traceback
   331  	SUBL	$4, BX
   332  	MOVL	$runtime·mstart(SB), DX
   333  	MOVL	DX, 0(BX)	// fake return address slot
   334  	MOVL	BX, SP
   335  
   336  	// call target function
   337  	MOVL	DI, DX		// DX = closure pointer for the callee
   338  	MOVL	0(DI), DI	// DI = code pointer stored in the first word of the closure
   339  	CALL	DI
   340  
   341  	// switch back to g
   342  	get_tls(CX)
   343  	MOVL	g(CX), AX
   344  	MOVL	g_m(AX), BX
   345  	MOVL	m_curg(BX), AX
   346  	MOVL	AX, g(CX)
   347  	MOVL	(g_sched+gobuf_sp)(AX), SP	// back onto the goroutine stack saved at switch
   348  	MOVL	$0, (g_sched+gobuf_sp)(AX)
   349  	RET
   350  
   351  noswitch:
   352  	// already on system stack, just call directly
   353  	MOVL	DI, DX
   354  	MOVL	0(DI), DI
   355  	CALL	DI
   356  	RET
   357  
   358  /*
   359   * support for morestack
   360   */
   361  
   362  // Called during function prolog when more stack is needed.
   363  //
   364  // The traceback routines see morestack on a g0 as being
   365  // the top of a stack (for example, morestack calling newstack
   366  // calling the scheduler calling newm calling gc), so we must
   367  // record an argument size. For that purpose, it has no arguments.
   368  TEXT runtime·morestack(SB),NOSPLIT,$0-0	// DX on entry is passed through to newstack as the ctxt argument
   369  	// Cannot grow scheduler stack (m->g0).
   370  	get_tls(CX)
   371  	MOVL	g(CX), BX
   372  	MOVL	g_m(BX), BX	// BX = m for the rest of the function
   373  	MOVL	m_g0(BX), SI
   374  	CMPL	g(CX), SI
   375  	JNE	3(PC)		// not g0: skip the call and the trap
   376  	CALL	runtime·badmorestackg0(SB)
   377  	INT	$3
   378  
   379  	// Cannot grow signal stack.
   380  	MOVL	m_gsignal(BX), SI
   381  	CMPL	g(CX), SI
   382  	JNE	3(PC)		// not gsignal: skip the call and the trap
   383  	CALL	runtime·badmorestackgsignal(SB)
   384  	INT	$3
   385  
   386  	// Called from f.
   387  	// Set m->morebuf to f's caller.
   388  	MOVL	4(SP), DI	// f's caller's PC
   389  	MOVL	DI, (m_morebuf+gobuf_pc)(BX)
   390  	LEAL	8(SP), CX	// f's caller's SP
   391  	MOVL	CX, (m_morebuf+gobuf_sp)(BX)
   392  	get_tls(CX)		// CX was reused as scratch above; reload it
   393  	MOVL	g(CX), SI
   394  	MOVL	SI, (m_morebuf+gobuf_g)(BX)
   395  
   396  	// Set g->sched to context in f.
   397  	MOVL	0(SP), AX	// f's PC
   398  	MOVL	AX, (g_sched+gobuf_pc)(SI)
   399  	MOVL	SI, (g_sched+gobuf_g)(SI)
   400  	LEAL	4(SP), AX	// f's SP
   401  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   402  	// newstack will fill gobuf.ctxt.
   403  
   404  	// Call newstack on m->g0's stack.
   405  	MOVL	m_g0(BX), BP
   406  	MOVL	BP, g(CX)	// current g = g0
   407  	MOVL	(g_sched+gobuf_sp)(BP), AX
   408  	MOVL	-4(AX), BX	// fault if CALL would, before smashing SP
   409  	MOVL	AX, SP
   410  	PUSHL	DX	// ctxt argument
   411  	CALL	runtime·newstack(SB)
   412  	MOVL	$0, 0x1003	// crash if newstack returns
   413  	POPL	DX	// keep balance check happy
   414  	RET
   415  
   416  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0-0	// morestack entry with no context: zero DX, then tail-jump
   417  	MOVL	$0, DX
   418  	JMP runtime·morestack(SB)
   419  
   420  // reflectcall: call a function with the given argument list
   421  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   422  // we don't have variable-sized frames, so we use a small number
   423  // of constant-sized-frame functions to encode a few bits of size in the pc.
   424  // Caution: ugly multiline assembly macros in your future!
   425  
   426  #define DISPATCH(NAME,MAXSIZE)		\
   427  	CMPL	CX, $MAXSIZE;		\
   428  	JA	3(PC);			\
   429  	MOVL	$NAME(SB), AX;		\
   430  	JMP	AX
   431  // Note: can't just "JMP NAME(SB)" - bad inlining results. DISPATCH jumps to NAME when CX <= MAXSIZE (unsigned); otherwise JA 3(PC) skips to whatever follows the macro.
   432  
   433  TEXT reflect·call(SB), NOSPLIT, $0-0	// alias in package reflect: tail-jumps to ·reflectcall with the caller's arguments in place
   434  	JMP	·reflectcall(SB)
   435  
   436  TEXT ·reflectcall(SB), NOSPLIT, $0-20	// jump to the smallest call<N> whose frame size N is >= argsize
   437  	MOVL	argsize+12(FP), CX	// CX = argsize, the value every DISPATCH compares against
   438  	DISPATCH(runtime·call16, 16)
   439  	DISPATCH(runtime·call32, 32)
   440  	DISPATCH(runtime·call64, 64)
   441  	DISPATCH(runtime·call128, 128)
   442  	DISPATCH(runtime·call256, 256)
   443  	DISPATCH(runtime·call512, 512)
   444  	DISPATCH(runtime·call1024, 1024)
   445  	DISPATCH(runtime·call2048, 2048)
   446  	DISPATCH(runtime·call4096, 4096)
   447  	DISPATCH(runtime·call8192, 8192)
   448  	DISPATCH(runtime·call16384, 16384)
   449  	DISPATCH(runtime·call32768, 32768)
   450  	DISPATCH(runtime·call65536, 65536)
   451  	DISPATCH(runtime·call131072, 131072)
   452  	DISPATCH(runtime·call262144, 262144)
   453  	DISPATCH(runtime·call524288, 524288)
   454  	DISPATCH(runtime·call1048576, 1048576)
   455  	DISPATCH(runtime·call2097152, 2097152)
   456  	DISPATCH(runtime·call4194304, 4194304)
   457  	DISPATCH(runtime·call8388608, 8388608)
   458  	DISPATCH(runtime·call16777216, 16777216)
   459  	DISPATCH(runtime·call33554432, 33554432)
   460  	DISPATCH(runtime·call67108864, 67108864)
   461  	DISPATCH(runtime·call134217728, 134217728)
   462  	DISPATCH(runtime·call268435456, 268435456)
   463  	DISPATCH(runtime·call536870912, 536870912)
   464  	DISPATCH(runtime·call1073741824, 1073741824)
   465  	MOVL	$runtime·badreflectcall(SB), AX	// argsize exceeds the largest frame (1073741824)
   466  	JMP	AX
   467  
   468  #define CALLFN(NAME,MAXSIZE)			\
   469  TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
   470  	NO_LOCAL_POINTERS;			\
   471  	/* copy argsize bytes of arguments from argptr to the bottom of this frame (0(SP)) */	\
   472  	MOVL	argptr+8(FP), SI;		\
   473  	MOVL	argsize+12(FP), CX;		\
   474  	MOVL	SP, DI;				\
   475  	REP;MOVSB;				\
   476  	/* call function: DX = f, AX = code pointer loaded from the first word of f */	\
   477  	MOVL	f+4(FP), DX;			\
   478  	MOVL	(DX), AX; 			\
   479  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   480  	CALL	AX;				\
   481  	/* copy return values back: advance both pointers by retoffset and pass argsize-retoffset bytes to callRet */	\
   482  	MOVL	argtype+0(FP), DX;		\
   483  	MOVL	argptr+8(FP), DI;		\
   484  	MOVL	argsize+12(FP), CX;		\
   485  	MOVL	retoffset+16(FP), BX;		\
   486  	MOVL	SP, SI;				\
   487  	ADDL	BX, DI;				\
   488  	ADDL	BX, SI;				\
   489  	SUBL	BX, CX;				\
   490  	CALL	callRet<>(SB);			\
   491  	RET
   492  
   493  // callRet copies return values back at the end of call*. This is a
   494  // separate function so it can allocate stack space for the arguments
   495  // to reflectcallmove. It does not follow the Go ABI; it expects its
   496  // arguments in registers.
   497  TEXT callRet<>(SB), NOSPLIT, $16-0	// inputs as set up by CALLFN: DX, DI, SI, CX
   498  	MOVL	DX, 0(SP)	// DX = argtype
   499  	MOVL	DI, 4(SP)	// DI = argptr + retoffset
   500  	MOVL	SI, 8(SP)	// SI = call frame base + retoffset
   501  	MOVL	CX, 12(SP)	// CX = argsize - retoffset
   502  	CALL	runtime·reflectcallmove(SB)
   503  	RET
   504  
   505  CALLFN(·call16, 16)	// one fixed-frame call<N> per size class; the list mirrors the DISPATCH chain in ·reflectcall
   506  CALLFN(·call32, 32)
   507  CALLFN(·call64, 64)
   508  CALLFN(·call128, 128)
   509  CALLFN(·call256, 256)
   510  CALLFN(·call512, 512)
   511  CALLFN(·call1024, 1024)
   512  CALLFN(·call2048, 2048)
   513  CALLFN(·call4096, 4096)
   514  CALLFN(·call8192, 8192)
   515  CALLFN(·call16384, 16384)
   516  CALLFN(·call32768, 32768)
   517  CALLFN(·call65536, 65536)
   518  CALLFN(·call131072, 131072)
   519  CALLFN(·call262144, 262144)
   520  CALLFN(·call524288, 524288)
   521  CALLFN(·call1048576, 1048576)
   522  CALLFN(·call2097152, 2097152)
   523  CALLFN(·call4194304, 4194304)
   524  CALLFN(·call8388608, 8388608)
   525  CALLFN(·call16777216, 16777216)
   526  CALLFN(·call33554432, 33554432)
   527  CALLFN(·call67108864, 67108864)
   528  CALLFN(·call134217728, 134217728)
   529  CALLFN(·call268435456, 268435456)
   530  CALLFN(·call536870912, 536870912)
   531  CALLFN(·call1073741824, 1073741824)
   532  
   533  TEXT runtime·procyield(SB),NOSPLIT,$0-0	// execute PAUSE `cycles` times. NOTE(review): frame is declared $0-0 although a 4-byte argument is read - confirm against the Go declaration
   534  	MOVL	cycles+0(FP), AX
   535  again:
   536  	PAUSE
   537  	SUBL	$1, AX	// a count of 0 wraps to 0xFFFFFFFF here, so the loop would run 2^32 times
   538  	JNZ	again
   539  	RET
   540  
   541  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0	// emits no fence instruction; the body is a bare RET
   542  	// Stores are already ordered on x86, so this is just a
   543  	// compile barrier.
   544  	RET
   545  
   546  // void jmpdefer(fn, sp);
   547  // called from deferreturn.
   548  // 1. pop the caller
   549  // 2. sub 5 bytes (the length of CALL & a 32 bit displacement) from the callers
   550  //    return (when building for shared libraries, subtract 16 bytes -- 5 bytes
   551  //    for CALL & displacement to call __x86.get_pc_thunk.cx, 6 bytes for the
   552  //    LEAL to load the offset into BX, and finally 5 for the call & displacement)
   553  // 3. jmp to the argument
   554  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
   555  	MOVL	fv+0(FP), DX	// fn
   556  	MOVL	argp+4(FP), BX	// caller sp
   557  	LEAL	-4(BX), SP	// caller sp after CALL
   558  #ifdef GOBUILDMODE_shared
   559  	SUBL	$16, (SP)	// return to CALL again
   560  #else
   561  	SUBL	$5, (SP)	// return to CALL again
   562  #endif
   563  	MOVL	0(DX), BX	// BX = code pointer from the first word of fn; DX stays as the closure pointer
   564  	JMP	BX	// but first run the deferred function
   565  
   566  // Save state of caller into g->sched. AX and BX are pushed on entry and popped before returning.
   567  TEXT gosave<>(SB),NOSPLIT,$0
   568  	PUSHL	AX
   569  	PUSHL	BX
   570  	get_tls(BX)
   571  	MOVL	g(BX), BX	// BX = current g
   572  	LEAL	arg+0(FP), AX	// AX = address just above our return address, i.e. the caller's SP
   573  	MOVL	AX, (g_sched+gobuf_sp)(BX)
   574  	MOVL	-4(AX), AX	// AX = our return address, i.e. the PC in the caller
   575  	MOVL	AX, (g_sched+gobuf_pc)(BX)
   576  	MOVL	$0, (g_sched+gobuf_ret)(BX)
   577  	// Assert ctxt is zero. See func save.
   578  	MOVL	(g_sched+gobuf_ctxt)(BX), AX
   579  	TESTL	AX, AX
   580  	JZ	2(PC)		// ctxt == 0: skip the badctxt call
   581  	CALL	runtime·badctxt(SB)
   582  	POPL	BX
   583  	POPL	AX
   584  	RET
   585  
   586  // func asmcgocall(fn, arg unsafe.Pointer) int32
   587  // Call fn(arg) on the scheduler stack,
   588  // aligned appropriately for the gcc ABI.
   589  // See cgocall.go for more details.
   590  TEXT ·asmcgocall(SB),NOSPLIT,$0-12
   591  	MOVL	fn+0(FP), AX
   592  	MOVL	arg+4(FP), BX
   593  
   594  	MOVL	SP, DX		// DX = SP on entry, used below to compute the stack depth
   595  
   596  	// Figure out if we need to switch to m->g0 stack.
   597  	// We get called to create new OS threads too, and those
   598  	// come in on the m->g0 stack already.
   599  	get_tls(CX)
   600  	MOVL	g(CX), BP
   601  	MOVL	g_m(BP), BP
   602  	MOVL	m_g0(BP), SI	// SI = m->g0
   603  	MOVL	g(CX), DI	// DI = current g
   604  	CMPL	SI, DI
   605  	JEQ	noswitch
   606  	CALL	gosave<>(SB)	// record the caller's SP/PC in g->sched before leaving this stack
   607  	get_tls(CX)
   608  	MOVL	SI, g(CX)	// current g = g0
   609  	MOVL	(g_sched+gobuf_sp)(SI), SP
   610  
   611  noswitch:
   612  	// Now on a scheduling stack (a pthread-created stack).
   613  	SUBL	$32, SP
   614  	ANDL	$~15, SP	// alignment, perhaps unnecessary
   615  	MOVL	DI, 8(SP)	// save g
   616  	MOVL	(g_stack+stack_hi)(DI), DI
   617  	SUBL	DX, DI		// DI = g->stack.hi - entry SP
   618  	MOVL	DI, 4(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   619  	MOVL	BX, 0(SP)	// first argument in x86-32 ABI
   620  	CALL	AX
   621  
   622  	// Restore registers, g, stack pointer.
   623  	get_tls(CX)
   624  	MOVL	8(SP), DI	// DI = saved g
   625  	MOVL	(g_stack+stack_hi)(DI), SI
   626  	SUBL	4(SP), SI	// SI = g->stack.hi - saved depth = SP to resume on
   627  	MOVL	DI, g(CX)
   628  	MOVL	SI, SP
   629  
   630  	MOVL	AX, ret+8(FP)	// return whatever fn left in AX
   631  	RET
   632  
   633  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
   634  // Turn the fn into a Go func (by taking its address) and call
   635  // cgocallback_gofunc.
   636  TEXT runtime·cgocallback(SB),NOSPLIT,$16-16
   637  	LEAL	fn+0(FP), AX	// AX = address of the fn argument slot, passed in place of a FuncVal*
   638  	MOVL	AX, 0(SP)
   639  	MOVL	frame+4(FP), AX
   640  	MOVL	AX, 4(SP)
   641  	MOVL	framesize+8(FP), AX
   642  	MOVL	AX, 8(SP)
   643  	MOVL	ctxt+12(FP), AX
   644  	MOVL	AX, 12(SP)
   645  	MOVL	$runtime·cgocallback_gofunc(SB), AX	// indirect call through AX
   646  	CALL	AX
   647  	RET
   648  
   649  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
   650  // See cgocall.go for more details.
   651  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-16
   652  	NO_LOCAL_POINTERS
   653  
   654  	// If g is nil, Go did not create the current thread.
   655  	// Call needm to obtain one for temporary use.
   656  	// In this case, we're running on the thread stack, so there's
   657  	// lots of space, but the linker doesn't know. Hide the call from
   658  	// the linker analysis by using an indirect call through AX.
   659  	get_tls(CX)
   660  #ifdef GOOS_windows
   661  	MOVL	$0, BP
   662  	CMPL	CX, $0
   663  	JEQ	2(PC) // TODO
   664  #endif
   665  	MOVL	g(CX), BP
   666  	CMPL	BP, $0
   667  	JEQ	needm
   668  	MOVL	g_m(BP), BP
   669  	MOVL	BP, DX // saved copy of oldm
   670  	JMP	havem
   671  needm:
   672  	MOVL	$0, 0(SP)
   673  	MOVL	$runtime·needm(SB), AX
   674  	CALL	AX
   675  	MOVL	0(SP), DX	// DX = oldm = contents of 0(SP) after needm; tested against 0 before dropm below
   676  	get_tls(CX)
   677  	MOVL	g(CX), BP
   678  	MOVL	g_m(BP), BP
   679  
   680  	// Set m->g0->sched.sp = SP, so that if a panic happens
   681  	// during the function we are about to execute, it will
   682  	// have a valid SP to run on the g0 stack.
   683  	// The next few lines (after the havem label)
   684  	// will save this SP onto the stack and then write
   685  	// the same SP back to m->g0->sched.sp. That seems redundant,
   686  	// but if an unrecovered panic happens, unwindm will
   687  	// restore the g->sched.sp from the stack location
   688  	// and then systemstack will try to use it. If we don't set it here,
   689  	// that restored SP will be uninitialized (typically 0) and
   690  	// will not be usable.
   691  	MOVL	m_g0(BP), SI
   692  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   693  
   694  havem:
   695  	// Now there's a valid m, and we're running on its m->g0.
   696  	// Save current m->g0->sched.sp on stack and then set it to SP.
   697  	// Save current sp in m->g0->sched.sp in preparation for
   698  	// switch back to m->curg stack.
   699  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   700  	MOVL	m_g0(BP), SI
   701  	MOVL	(g_sched+gobuf_sp)(SI), AX
   702  	MOVL	AX, 0(SP)
   703  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   704  
   705  	// Switch to m->curg stack and call runtime.cgocallbackg.
   706  	// Because we are taking over the execution of m->curg
   707  	// but *not* resuming what had been running, we need to
   708  	// save that information (m->curg->sched) so we can restore it.
   709  	// We can restore m->curg->sched.sp easily, because calling
   710  	// runtime.cgocallbackg leaves SP unchanged upon return.
   711  	// To save m->curg->sched.pc, we push it onto the stack.
   712  	// This has the added benefit that it looks to the traceback
   713  	// routine like cgocallbackg is going to return to that
   714  	// PC (because the frame we allocate below has the same
   715  	// size as cgocallback_gofunc's frame declared above)
   716  	// so that the traceback will seamlessly trace back into
   717  	// the earlier calls.
   718  	//
   719  	// In the new goroutine, 4(SP) holds the saved oldm (DX) register.
   720  	// 8(SP) is unused.
   721  	MOVL	m_curg(BP), SI
   722  	MOVL	SI, g(CX)
   723  	MOVL	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
   724  	MOVL	(g_sched+gobuf_pc)(SI), BP
   725  	MOVL	BP, -4(DI)	// saved curg->sched.pc sits where a return address would
   726  	MOVL	ctxt+12(FP), CX	// read the argument before SP is changed
   727  	LEAL	-(4+12)(DI), SP	// 4 bytes for the saved PC plus a 12-byte frame
   728  	MOVL	DX, 4(SP)
   729  	MOVL	CX, 0(SP)	// argument to cgocallbackg: ctxt
   730  	CALL	runtime·cgocallbackg(SB)
   731  	MOVL	4(SP), DX	// reload oldm saved before the call
   732  
   733  	// Restore g->sched (== m->curg->sched) from saved values.
   734  	get_tls(CX)
   735  	MOVL	g(CX), SI
   736  	MOVL	12(SP), BP	// the PC stored at -4(DI) above
   737  	MOVL	BP, (g_sched+gobuf_pc)(SI)
   738  	LEAL	(12+4)(SP), DI	// the sched.sp value that was in DI above
   739  	MOVL	DI, (g_sched+gobuf_sp)(SI)
   740  
   741  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   742  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   743  	// so we do not have to restore it.)
   744  	MOVL	g(CX), BP
   745  	MOVL	g_m(BP), BP
   746  	MOVL	m_g0(BP), SI
   747  	MOVL	SI, g(CX)
   748  	MOVL	(g_sched+gobuf_sp)(SI), SP
   749  	MOVL	0(SP), AX	// old m->g0->sched.sp saved at havem
   750  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   751  	
   752  	// If the m on entry was nil, we called needm above to borrow an m
   753  	// for the duration of the call. Since the call is over, return it with dropm.
   754  	CMPL	DX, $0
   755  	JNE 3(PC)	// oldm != 0: skip the indirect dropm call
   756  	MOVL	$runtime·dropm(SB), AX
   757  	CALL	AX
   758  
   759  	// Done!
   760  	RET
   761  
   762  // void setg(G*); set g. for use by needm.
   763  TEXT runtime·setg(SB), NOSPLIT, $0-4
   764  	MOVL	gg+0(FP), BX
   765  #ifdef GOOS_windows
   766  	CMPL	BX, $0
   767  	JNE	settls
   768  	MOVL	$0, 0x14(FS)	// gg == nil: clear the slot at 0x14(FS) and return without touching g(CX)
   769  	RET
   770  settls:
   771  	MOVL	g_m(BX), AX
   772  	LEAL	m_tls(AX), AX
   773  	MOVL	AX, 0x14(FS)	// 0x14(FS) = &gg->m->tls
   774  #endif
   775  	get_tls(CX)
   776  	MOVL	BX, g(CX)	// current g = gg
   777  	RET
   778  
   779  // void setg_gcc(G*); set g. for use by gcc. rt0_go passes its address to _cgo_init.
   780  TEXT setg_gcc<>(SB), NOSPLIT, $0
   781  	get_tls(AX)
   782  	MOVL	gg+0(FP), DX
   783  	MOVL	DX, g(AX)	// current g = gg
   784  	RET
   785  
   786  // check that SP is in range: traps (INT $3) unless g->stack.lo < SP < g->stack.hi, both compared unsigned
   787  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   788  	get_tls(CX)
   789  	MOVL	g(CX), AX
   790  	CMPL	(g_stack+stack_hi)(AX), SP
   791  	JHI	2(PC)	// stack.hi > SP: skip the trap
   792  	INT	$3
   793  	CMPL	SP, (g_stack+stack_lo)(AX)
   794  	JHI	2(PC)	// SP > stack.lo: skip the trap
   795  	INT	$3
   796  	RET
   797  
   798  TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8	// return the word stored 4 bytes below the address passed in argp
   799  	MOVL	argp+0(FP),AX		// addr of first arg
   800  	MOVL	-4(AX),AX		// get calling pc
   801  	MOVL	AX, ret+4(FP)
   802  	RET
   803  
   804  // func cputicks() int64
   805  TEXT runtime·cputicks(SB),NOSPLIT,$0-8	// return RDTSC's EDX:EAX, preceded by LFENCE or MFENCE when bit 26 of cpuid_edx is set
   806  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // no sse2, no mfence
   807  	JEQ	done
   808  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1	// set to 1 by rt0_go when the CPUID vendor string matched Intel
   809  	JNE	mfence
   810  	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
   811  	JMP	done
   812  mfence:
   813  	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
   814  done:
   815  	RDTSC
   816  	MOVL	AX, ret_lo+0(FP)
   817  	MOVL	DX, ret_hi+4(FP)
   818  	RET
   819  
   820  TEXT runtime·ldt0setup(SB),NOSPLIT,$16-0	// calls setldt(7, &m0.tls, 32)
   821  	// set up ldt 7 to point at m0.tls
   822  	// ldt 1 would be fine on Linux, but on OS X, 7 is as low as we can go.
   823  	// the entry number is just a hint.  setldt will set up GS with what it used.
   824  	MOVL	$7, 0(SP)
   825  	LEAL	runtime·m0+m_tls(SB), AX
   826  	MOVL	AX, 4(SP)
   827  	MOVL	$32, 8(SP)	// sizeof(tls array)
   828  	CALL	runtime·setldt(SB)
   829  	RET
   830  
   831  TEXT runtime·emptyfunc(SB),0,$0-0	// flags are 0, not NOSPLIT; rt0_go calls this to "fault if stack check is wrong"
   832  	RET
   833  
   834  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   835  // redirects to memhash(p, h, size) using the size
   836  // stored in the closure.
   837  TEXT runtime·memhash_varlen(SB),NOSPLIT,$16-12
   838  	GO_ARGS
   839  	NO_LOCAL_POINTERS
   840  	MOVL	p+0(FP), AX
   841  	MOVL	h+4(FP), BX
   842  	MOVL	4(DX), CX	// size is read from offset 4 of the closure that DX points to
   843  	MOVL	AX, 0(SP)
   844  	MOVL	BX, 4(SP)
   845  	MOVL	CX, 8(SP)
   846  	CALL	runtime·memhash(SB)
   847  	MOVL	12(SP), AX	// memhash's result slot follows its three 4-byte arguments
   848  	MOVL	AX, ret+8(FP)
   849  	RET
   850  
   851  // hash function using AES hardware instructions. Loads registers as aeshashbody expects (AX data, BX length, DX result address) and tail-jumps to it.
   852  TEXT runtime·aeshash(SB),NOSPLIT,$0-16
   853  	MOVL	p+0(FP), AX	// ptr to data
   854  	MOVL	s+8(FP), BX	// size
   855  	LEAL	ret+12(FP), DX	// address of the result slot
   856  	JMP	runtime·aeshashbody(SB)
   857  
   858  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12	// hash the string that p points to: data pointer at offset 0, length at offset 4
   859  	MOVL	p+0(FP), AX	// ptr to string object
   860  	MOVL	4(AX), BX	// length of string
   861  	MOVL	(AX), AX	// string data
   862  	LEAL	ret+8(FP), DX	// address of the result slot
   863  	JMP	runtime·aeshashbody(SB)
   864  
   865  // AX: data
   866  // BX: length
   867  // DX: address to put return value
   868  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0	// reached by JMP from aeshash/aeshashstr, so h+4(FP) below is the seed argument of those callers
   869  	MOVL	h+4(FP), X0	            // 32 bits of per-table hash seed
   870  	PINSRW	$4, BX, X0	            // 16 bits of length
   871  	PSHUFHW	$0, X0, X0	            // replace size with its low 2 bytes repeated 4 times
   872  	MOVO	X0, X1                      // save unscrambled seed
   873  	PXOR	runtime·aeskeysched(SB), X0 // xor in per-process seed
   874  	AESENC	X0, X0                      // scramble seed
   875  
   876  	CMPL	BX, $16	// pick a path by length: 0-15, 16, 17-32, 33-64, 65+
   877  	JB	aes0to15
   878  	JE	aes16
   879  	CMPL	BX, $32
   880  	JBE	aes17to32
   881  	CMPL	BX, $64
   882  	JBE	aes33to64
   883  	JMP	aes65plus
   884  	
   885  aes0to15:
   886  	TESTL	BX, BX
   887  	JE	aes0
   888  
   889  	ADDL	$16, AX
   890  	TESTW	$0xff0, AX	// zero when bits 4-11 of data+16 are all clear
   891  	JE	endofpage
   892  
   893  	// 16 bytes loaded at this address won't cross
   894  	// a page boundary, so we can load it directly.
   895  	MOVOU	-16(AX), X1	// the 16 bytes starting at the original data pointer
   896  	ADDL	BX, BX
   897  	PAND	masks<>(SB)(BX*8), X1	// byte offset 16*length into masks<> (table defined outside this function)
   898  
   899  final1:	
   900  	AESENC	X0, X1  // scramble input, xor in seed
   901  	AESENC	X1, X1  // scramble combo 2 times
   902  	AESENC	X1, X1
   903  	MOVL	X1, (DX)	// store the low 32 bits as the result
   904  	RET
   905  
   906  endofpage:
   907  	// address ends in 1111xxxx. Might be up against
   908  	// a page boundary, so load ending at last byte.
   909  	// Then shift bytes down using pshufb.
   910  	MOVOU	-32(AX)(BX*1), X1	// AX is data+16 here, so this loads the 16 bytes ending at data+length
   911  	ADDL	BX, BX
   912  	PSHUFB	shifts<>(SB)(BX*8), X1	// byte offset 16*length into shifts<> (table defined outside this function)
   913  	JMP	final1
   914  
   915  aes0:
   916  	// Return scrambled input seed
   917  	AESENC	X0, X0
   918  	MOVL	X0, (DX)
   919  	RET
   920  
   921  aes16:
   922  	MOVOU	(AX), X1
   923  	JMP	final1
   924  
   925  aes17to32:
   926  	// make second starting seed
   927  	PXOR	runtime·aeskeysched+16(SB), X1
   928  	AESENC	X1, X1
   929  	
   930  	// load data to be hashed
   931  	MOVOU	(AX), X2	// first 16 bytes
   932  	MOVOU	-16(AX)(BX*1), X3	// last 16 bytes; overlaps the first block when length < 32
   933  
   934  	// scramble 3 times
   935  	AESENC	X0, X2
   936  	AESENC	X1, X3
   937  	AESENC	X2, X2
   938  	AESENC	X3, X3
   939  	AESENC	X2, X2
   940  	AESENC	X3, X3
   941  
   942  	// combine results
   943  	PXOR	X3, X2
   944  	MOVL	X2, (DX)
   945  	RET
   946  
   947  aes33to64:
   948  	// make 3 more starting seeds
   949  	MOVO	X1, X2
   950  	MOVO	X1, X3
   951  	PXOR	runtime·aeskeysched+16(SB), X1
   952  	PXOR	runtime·aeskeysched+32(SB), X2
   953  	PXOR	runtime·aeskeysched+48(SB), X3
   954  	AESENC	X1, X1
   955  	AESENC	X2, X2
   956  	AESENC	X3, X3
   957  	
   958  	MOVOU	(AX), X4	// first 32 bytes in X4, X5
   959  	MOVOU	16(AX), X5
   960  	MOVOU	-32(AX)(BX*1), X6	// last 32 bytes in X6, X7; may overlap the first 32
   961  	MOVOU	-16(AX)(BX*1), X7
   962  	
   963  	AESENC	X0, X4
   964  	AESENC	X1, X5
   965  	AESENC	X2, X6
   966  	AESENC	X3, X7
   967  	
   968  	AESENC	X4, X4
   969  	AESENC	X5, X5
   970  	AESENC	X6, X6
   971  	AESENC	X7, X7
   972  	
   973  	AESENC	X4, X4
   974  	AESENC	X5, X5
   975  	AESENC	X6, X6
   976  	AESENC	X7, X7
   977  
   978  	PXOR	X6, X4
   979  	PXOR	X7, X5
   980  	PXOR	X5, X4
   981  	MOVL	X4, (DX)
   982  	RET
   983  
   984  aes65plus:
   985  	// make 3 more starting seeds
   986  	MOVO	X1, X2
   987  	MOVO	X1, X3
   988  	PXOR	runtime·aeskeysched+16(SB), X1
   989  	PXOR	runtime·aeskeysched+32(SB), X2
   990  	PXOR	runtime·aeskeysched+48(SB), X3
   991  	AESENC	X1, X1
   992  	AESENC	X2, X2
   993  	AESENC	X3, X3
   994  	
   995  	// start with last (possibly overlapping) block
   996  	MOVOU	-64(AX)(BX*1), X4
   997  	MOVOU	-48(AX)(BX*1), X5
   998  	MOVOU	-32(AX)(BX*1), X6
   999  	MOVOU	-16(AX)(BX*1), X7
  1000  
  1001  	// scramble state once
  1002  	AESENC	X0, X4
  1003  	AESENC	X1, X5
  1004  	AESENC	X2, X6
  1005  	AESENC	X3, X7
  1006  
  1007  	// compute number of remaining 64-byte blocks
  1008  	DECL	BX
  1009  	SHRL	$6, BX	// BX = (length-1)/64, which is at least 1 on this path
  1010  	
  1011  aesloop:
  1012  	// scramble state, xor in a block
  1013  	MOVOU	(AX), X0	// X0-X3 are reused for data from here on; the seeds are already folded into X4-X7
  1014  	MOVOU	16(AX), X1
  1015  	MOVOU	32(AX), X2
  1016  	MOVOU	48(AX), X3
  1017  	AESENC	X0, X4
  1018  	AESENC	X1, X5
  1019  	AESENC	X2, X6
  1020  	AESENC	X3, X7
  1021  
  1022  	// scramble state
  1023  	AESENC	X4, X4
  1024  	AESENC	X5, X5
  1025  	AESENC	X6, X6
  1026  	AESENC	X7, X7
  1027  
  1028  	ADDL	$64, AX	// advance to the next 64-byte block
  1029  	DECL	BX
  1030  	JNE	aesloop
  1031  
  1032  	// 2 more scrambles to finish
  1033  	AESENC	X4, X4
  1034  	AESENC	X5, X5
  1035  	AESENC	X6, X6
  1036  	AESENC	X7, X7
  1037  	
  1038  	AESENC	X4, X4
  1039  	AESENC	X5, X5
  1040  	AESENC	X6, X6
  1041  	AESENC	X7, X7
  1042  
  1043  	PXOR	X6, X4
  1044  	PXOR	X7, X5
  1045  	PXOR	X5, X4
  1046  	MOVL	X4, (DX)
  1047  	RET
  1048  
  1049  TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
        	// Hashes the 4 bytes at p together with the seed h by running three
        	// AESENC rounds keyed from runtime·aeskeysched, and returns the low
        	// 32 bits of the resulting state in ret.
  1050  	MOVL	p+0(FP), AX	// ptr to data
  1051  	MOVL	h+4(FP), X0	// seed goes into dword 0 of X0
  1052  	PINSRD	$1, (AX), X0	// data goes into dword 1 of X0
  1053  	AESENC	runtime·aeskeysched+0(SB), X0	// round 1, key bytes 0-15
  1054  	AESENC	runtime·aeskeysched+16(SB), X0	// round 2, key bytes 16-31
  1055  	AESENC	runtime·aeskeysched+32(SB), X0	// round 3, key bytes 32-47
  1056  	MOVL	X0, ret+8(FP)	// result is the low 32 bits of the state
  1057  	RET
  1058  
  1059  TEXT runtime·aeshash64(SB),NOSPLIT,$0-12
        	// Hashes the 8 bytes at p together with the seed h by running three
        	// AESENC rounds keyed from runtime·aeskeysched, and returns the low
        	// 32 bits of the resulting state in ret.
  1060  	MOVL	p+0(FP), AX	// ptr to data
  1061  	MOVQ	(AX), X0	// data fills the low 8 bytes of X0
  1062  	PINSRD	$2, h+4(FP), X0	// seed goes into dword 2 of X0
  1063  	AESENC	runtime·aeskeysched+0(SB), X0	// round 1, key bytes 0-15
  1064  	AESENC	runtime·aeskeysched+16(SB), X0	// round 2, key bytes 16-31
  1065  	AESENC	runtime·aeskeysched+32(SB), X0	// round 3, key bytes 32-47
  1066  	MOVL	X0, ret+8(FP)	// result is the low 32 bits of the state
  1067  	RET
  1068  
  1069  // simple masks to get rid of data in the high part of the register.
        // The table holds sixteen 16-byte entries. Entry i (at byte offset 16*i)
        // has its low i bytes set to 0xff and all remaining bytes zero, so a
        // PAND with entry i keeps only the low i bytes of an XMM register.
        // It is used as a direct memory operand of PAND, indexed by length*16.
  1070  DATA masks<>+0x00(SB)/4, $0x00000000
  1071  DATA masks<>+0x04(SB)/4, $0x00000000
  1072  DATA masks<>+0x08(SB)/4, $0x00000000
  1073  DATA masks<>+0x0c(SB)/4, $0x00000000
  1074  	
  1075  DATA masks<>+0x10(SB)/4, $0x000000ff
  1076  DATA masks<>+0x14(SB)/4, $0x00000000
  1077  DATA masks<>+0x18(SB)/4, $0x00000000
  1078  DATA masks<>+0x1c(SB)/4, $0x00000000
  1079  	
  1080  DATA masks<>+0x20(SB)/4, $0x0000ffff
  1081  DATA masks<>+0x24(SB)/4, $0x00000000
  1082  DATA masks<>+0x28(SB)/4, $0x00000000
  1083  DATA masks<>+0x2c(SB)/4, $0x00000000
  1084  	
  1085  DATA masks<>+0x30(SB)/4, $0x00ffffff
  1086  DATA masks<>+0x34(SB)/4, $0x00000000
  1087  DATA masks<>+0x38(SB)/4, $0x00000000
  1088  DATA masks<>+0x3c(SB)/4, $0x00000000
  1089  	
  1090  DATA masks<>+0x40(SB)/4, $0xffffffff
  1091  DATA masks<>+0x44(SB)/4, $0x00000000
  1092  DATA masks<>+0x48(SB)/4, $0x00000000
  1093  DATA masks<>+0x4c(SB)/4, $0x00000000
  1094  	
  1095  DATA masks<>+0x50(SB)/4, $0xffffffff
  1096  DATA masks<>+0x54(SB)/4, $0x000000ff
  1097  DATA masks<>+0x58(SB)/4, $0x00000000
  1098  DATA masks<>+0x5c(SB)/4, $0x00000000
  1099  	
  1100  DATA masks<>+0x60(SB)/4, $0xffffffff
  1101  DATA masks<>+0x64(SB)/4, $0x0000ffff
  1102  DATA masks<>+0x68(SB)/4, $0x00000000
  1103  DATA masks<>+0x6c(SB)/4, $0x00000000
  1104  	
  1105  DATA masks<>+0x70(SB)/4, $0xffffffff
  1106  DATA masks<>+0x74(SB)/4, $0x00ffffff
  1107  DATA masks<>+0x78(SB)/4, $0x00000000
  1108  DATA masks<>+0x7c(SB)/4, $0x00000000
  1109  	
  1110  DATA masks<>+0x80(SB)/4, $0xffffffff
  1111  DATA masks<>+0x84(SB)/4, $0xffffffff
  1112  DATA masks<>+0x88(SB)/4, $0x00000000
  1113  DATA masks<>+0x8c(SB)/4, $0x00000000
  1114  	
  1115  DATA masks<>+0x90(SB)/4, $0xffffffff
  1116  DATA masks<>+0x94(SB)/4, $0xffffffff
  1117  DATA masks<>+0x98(SB)/4, $0x000000ff
  1118  DATA masks<>+0x9c(SB)/4, $0x00000000
  1119  	
  1120  DATA masks<>+0xa0(SB)/4, $0xffffffff
  1121  DATA masks<>+0xa4(SB)/4, $0xffffffff
  1122  DATA masks<>+0xa8(SB)/4, $0x0000ffff
  1123  DATA masks<>+0xac(SB)/4, $0x00000000
  1124  	
  1125  DATA masks<>+0xb0(SB)/4, $0xffffffff
  1126  DATA masks<>+0xb4(SB)/4, $0xffffffff
  1127  DATA masks<>+0xb8(SB)/4, $0x00ffffff
  1128  DATA masks<>+0xbc(SB)/4, $0x00000000
  1129  	
  1130  DATA masks<>+0xc0(SB)/4, $0xffffffff
  1131  DATA masks<>+0xc4(SB)/4, $0xffffffff
  1132  DATA masks<>+0xc8(SB)/4, $0xffffffff
  1133  DATA masks<>+0xcc(SB)/4, $0x00000000
  1134  	
  1135  DATA masks<>+0xd0(SB)/4, $0xffffffff
  1136  DATA masks<>+0xd4(SB)/4, $0xffffffff
  1137  DATA masks<>+0xd8(SB)/4, $0xffffffff
  1138  DATA masks<>+0xdc(SB)/4, $0x000000ff
  1139  	
  1140  DATA masks<>+0xe0(SB)/4, $0xffffffff
  1141  DATA masks<>+0xe4(SB)/4, $0xffffffff
  1142  DATA masks<>+0xe8(SB)/4, $0xffffffff
  1143  DATA masks<>+0xec(SB)/4, $0x0000ffff
  1144  	
  1145  DATA masks<>+0xf0(SB)/4, $0xffffffff
  1146  DATA masks<>+0xf4(SB)/4, $0xffffffff
  1147  DATA masks<>+0xf8(SB)/4, $0xffffffff
  1148  DATA masks<>+0xfc(SB)/4, $0x00ffffff
  1149  
  1150  GLOBL masks<>(SB),RODATA,$256	// 16 entries * 16 bytes, read-only
  1151  
  1152  // these are arguments to pshufb. They move data down from
  1153  // the high bytes of the register to the low bytes of the register.
  1154  // index is how many bytes to move.
        // The table holds sixteen 16-byte entries. Entry i (at byte offset 16*i)
        // selects source bytes 16-i..15 into destination bytes 0..i-1; the
        // remaining control bytes are 0xff, which makes PSHUFB zero those
        // destination bytes. Entry 0 is all zeros rather than all 0xff.
        // NOTE(review): presumably entry 0 is never used because zero-length
        // input is handled separately -- confirm against the caller.
  1155  DATA shifts<>+0x00(SB)/4, $0x00000000
  1156  DATA shifts<>+0x04(SB)/4, $0x00000000
  1157  DATA shifts<>+0x08(SB)/4, $0x00000000
  1158  DATA shifts<>+0x0c(SB)/4, $0x00000000
  1159  	
  1160  DATA shifts<>+0x10(SB)/4, $0xffffff0f
  1161  DATA shifts<>+0x14(SB)/4, $0xffffffff
  1162  DATA shifts<>+0x18(SB)/4, $0xffffffff
  1163  DATA shifts<>+0x1c(SB)/4, $0xffffffff
  1164  	
  1165  DATA shifts<>+0x20(SB)/4, $0xffff0f0e
  1166  DATA shifts<>+0x24(SB)/4, $0xffffffff
  1167  DATA shifts<>+0x28(SB)/4, $0xffffffff
  1168  DATA shifts<>+0x2c(SB)/4, $0xffffffff
  1169  	
  1170  DATA shifts<>+0x30(SB)/4, $0xff0f0e0d
  1171  DATA shifts<>+0x34(SB)/4, $0xffffffff
  1172  DATA shifts<>+0x38(SB)/4, $0xffffffff
  1173  DATA shifts<>+0x3c(SB)/4, $0xffffffff
  1174  	
  1175  DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c
  1176  DATA shifts<>+0x44(SB)/4, $0xffffffff
  1177  DATA shifts<>+0x48(SB)/4, $0xffffffff
  1178  DATA shifts<>+0x4c(SB)/4, $0xffffffff
  1179  	
  1180  DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b
  1181  DATA shifts<>+0x54(SB)/4, $0xffffff0f
  1182  DATA shifts<>+0x58(SB)/4, $0xffffffff
  1183  DATA shifts<>+0x5c(SB)/4, $0xffffffff
  1184  	
  1185  DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a
  1186  DATA shifts<>+0x64(SB)/4, $0xffff0f0e
  1187  DATA shifts<>+0x68(SB)/4, $0xffffffff
  1188  DATA shifts<>+0x6c(SB)/4, $0xffffffff
  1189  	
  1190  DATA shifts<>+0x70(SB)/4, $0x0c0b0a09
  1191  DATA shifts<>+0x74(SB)/4, $0xff0f0e0d
  1192  DATA shifts<>+0x78(SB)/4, $0xffffffff
  1193  DATA shifts<>+0x7c(SB)/4, $0xffffffff
  1194  	
  1195  DATA shifts<>+0x80(SB)/4, $0x0b0a0908
  1196  DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c
  1197  DATA shifts<>+0x88(SB)/4, $0xffffffff
  1198  DATA shifts<>+0x8c(SB)/4, $0xffffffff
  1199  	
  1200  DATA shifts<>+0x90(SB)/4, $0x0a090807
  1201  DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b
  1202  DATA shifts<>+0x98(SB)/4, $0xffffff0f
  1203  DATA shifts<>+0x9c(SB)/4, $0xffffffff
  1204  	
  1205  DATA shifts<>+0xa0(SB)/4, $0x09080706
  1206  DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a
  1207  DATA shifts<>+0xa8(SB)/4, $0xffff0f0e
  1208  DATA shifts<>+0xac(SB)/4, $0xffffffff
  1209  	
  1210  DATA shifts<>+0xb0(SB)/4, $0x08070605
  1211  DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09
  1212  DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d
  1213  DATA shifts<>+0xbc(SB)/4, $0xffffffff
  1214  	
  1215  DATA shifts<>+0xc0(SB)/4, $0x07060504
  1216  DATA shifts<>+0xc4(SB)/4, $0x0b0a0908
  1217  DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c
  1218  DATA shifts<>+0xcc(SB)/4, $0xffffffff
  1219  	
  1220  DATA shifts<>+0xd0(SB)/4, $0x06050403
  1221  DATA shifts<>+0xd4(SB)/4, $0x0a090807
  1222  DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b
  1223  DATA shifts<>+0xdc(SB)/4, $0xffffff0f
  1224  	
  1225  DATA shifts<>+0xe0(SB)/4, $0x05040302
  1226  DATA shifts<>+0xe4(SB)/4, $0x09080706
  1227  DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a
  1228  DATA shifts<>+0xec(SB)/4, $0xffff0f0e
  1229  	
  1230  DATA shifts<>+0xf0(SB)/4, $0x04030201
  1231  DATA shifts<>+0xf4(SB)/4, $0x08070605
  1232  DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09
  1233  DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d
  1234  
  1235  GLOBL shifts<>(SB),RODATA,$256	// 16 entries * 16 bytes, read-only
  1236  
  1237  TEXT ·checkASM(SB),NOSPLIT,$0-1
        	// Returns true iff both lookup tables start on a 16-byte boundary.
        	// The tables are used directly as memory operands of PAND and
        	// PSHUFB elsewhere in this file.
  1238  	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
  1239  	MOVL	$masks<>(SB), AX
  1240  	MOVL	$shifts<>(SB), BX
  1241  	ORL	BX, AX	// a low bit survives if either address is misaligned
  1242  	TESTL	$15, AX	// ZF set iff the low 4 bits of both addresses are zero
  1243  	SETEQ	ret+0(FP)
  1244  	RET
  1245  
  1246  // memequal(p, q unsafe.Pointer, size uintptr) bool
        // Reports whether the size bytes at the two pointers are equal.
        // Identical pointers return true without reading size or memory;
        // otherwise the comparison is tail-called to memeqbody, which stores
        // the result byte and returns to our caller.
  1247  TEXT runtime·memequal(SB),NOSPLIT,$0-13
  1248  	MOVL	a+0(FP), SI
  1249  	MOVL	b+4(FP), DI
  1250  	CMPL	SI, DI
  1251  	JEQ	eq	// same address: trivially equal
  1252  	MOVL	size+8(FP), BX	// BX = byte count for memeqbody
  1253  	LEAL	ret+12(FP), AX	// AX = address of the result byte
  1254  	JMP	runtime·memeqbody(SB)
  1255  eq:
  1256  	MOVB    $1, ret+12(FP)
  1257  	RET
  1258  
  1259  // memequal_varlen(a, b unsafe.Pointer) bool
        // Like memequal, but the byte count is not an argument: it is read
        // from offset 4 of the closure whose address is in DX on entry.
        // Identical pointers return true without reading the size.
  1260  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
  1261  	MOVL    a+0(FP), SI
  1262  	MOVL    b+4(FP), DI
  1263  	CMPL    SI, DI
  1264  	JEQ     eq	// same address: trivially equal
  1265  	MOVL    4(DX), BX    // compiler stores size at offset 4 in the closure
  1266  	LEAL	ret+8(FP), AX	// AX = address of the result byte
  1267  	JMP	runtime·memeqbody(SB)
  1268  eq:
  1269  	MOVB    $1, ret+8(FP)
  1270  	RET
  1271  
  1272  // eqstring tests whether two strings are equal.
  1273  // The compiler guarantees that strings passed
  1274  // to eqstring have equal length.
  1275  // See runtime_test.go:eqstring_generic for
  1276  // equivalent Go code.
        // Only s1's length is read here; s2's length is never loaded.
  1277  TEXT runtime·eqstring(SB),NOSPLIT,$0-17
  1278  	MOVL	s1_base+0(FP), SI
  1279  	MOVL	s2_base+8(FP), DI
  1280  	CMPL	SI, DI
  1281  	JEQ	same	// same data pointer: equal without reading memory
  1282  	MOVL	s1_len+4(FP), BX	// BX = byte count for memeqbody
  1283  	LEAL	ret+16(FP), AX	// AX = address of the result byte
  1284  	JMP	runtime·memeqbody(SB)
  1285  same:
  1286  	MOVB	$1, ret+16(FP)
  1287  	RET
  1288  
  1289  TEXT bytes·Equal(SB),NOSPLIT,$0-25
        	// Reports whether two byte slices have the same length and the
        	// same contents. Slices of different length are unequal without
        	// looking at the data; otherwise memeqbody does the comparison.
  1290  	MOVL	a_len+4(FP), BX
  1291  	MOVL	b_len+16(FP), CX
  1292  	CMPL	BX, CX
  1293  	JNE	eqret	// lengths differ: not equal
  1294  	MOVL	a+0(FP), SI
  1295  	MOVL	b+12(FP), DI
  1296  	LEAL	ret+24(FP), AX	// AX = address of the result byte
  1297  	JMP	runtime·memeqbody(SB)
  1298  eqret:
  1299  	MOVB	$0, ret+24(FP)
  1300  	RET
  1301  
  1302  // a in SI
  1303  // b in DI
  1304  // count in BX
  1305  // address of result byte in AX
        // Compares count bytes at a and b and stores 1 (equal) or 0 (different)
        // to the result byte. The callers in this file reach it with JMP, so
        // its RET returns to their caller. CX, DX and X0-X7 are used as
        // scratch, and SI, DI and BX are modified along the way.
  1306  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1307  	CMPL	BX, $4
  1308  	JB	small	// 0-3 bytes need the careful single-load path
  1309  
  1310  	// 64 bytes at a time using xmm registers
  1311  hugeloop:
  1312  	CMPL	BX, $64
  1313  	JB	bigloop
  1314  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
  1315  	JE	bigloop	// no sse2: fall back to the 4-byte loop
  1316  	MOVOU	(SI), X0
  1317  	MOVOU	(DI), X1
  1318  	MOVOU	16(SI), X2
  1319  	MOVOU	16(DI), X3
  1320  	MOVOU	32(SI), X4
  1321  	MOVOU	32(DI), X5
  1322  	MOVOU	48(SI), X6
  1323  	MOVOU	48(DI), X7
  1324  	PCMPEQB	X1, X0	// each result byte is 0xff where a and b match
  1325  	PCMPEQB	X3, X2
  1326  	PCMPEQB	X5, X4
  1327  	PCMPEQB	X7, X6
  1328  	PAND	X2, X0	// fold the four 16-byte results into X0
  1329  	PAND	X6, X4
  1330  	PAND	X4, X0
  1331  	PMOVMSKB X0, DX	// DX = one bit per byte of the folded result
  1332  	ADDL	$64, SI
  1333  	ADDL	$64, DI
  1334  	SUBL	$64, BX
  1335  	CMPL	DX, $0xffff	// all 16 bits set means all 64 bytes matched
  1336  	JEQ	hugeloop
  1337  	MOVB	$0, (AX)
  1338  	RET
  1339  
  1340  	// 4 bytes at a time using 32-bit register
  1341  bigloop:
  1342  	CMPL	BX, $4
  1343  	JBE	leftover	// 4 or fewer bytes remain: finish with one load
  1344  	MOVL	(SI), CX
  1345  	MOVL	(DI), DX
  1346  	ADDL	$4, SI
  1347  	ADDL	$4, DI
  1348  	SUBL	$4, BX
  1349  	CMPL	CX, DX
  1350  	JEQ	bigloop
  1351  	MOVB	$0, (AX)
  1352  	RET
  1353  
  1354  	// remaining 0-4 bytes
        	// Compare the 4 bytes that end at the end of each buffer. This may
        	// re-read bytes already compared; this path is only entered when
        	// the original count was at least 4, so those bytes are in bounds.
  1355  leftover:
  1356  	MOVL	-4(SI)(BX*1), CX
  1357  	MOVL	-4(DI)(BX*1), DX
  1358  	CMPL	CX, DX
  1359  	SETEQ	(AX)
  1360  	RET
  1361  
  1362  small:
  1363  	CMPL	BX, $0
  1364  	JEQ	equal	// zero bytes: ZF is set, so SETEQ below stores 1
  1365  
  1366  	LEAL	0(BX*8), CX
  1367  	NEGL	CX	// CX = -8*count; shifts use the low 5 bits, i.e. 32-8*count
  1368  
  1369  	MOVL	SI, DX
  1370  	CMPB	DX, $0xfc	// compare the low byte of the address
  1371  	JA	si_high
  1372  
  1373  	// load at SI won't cross a page boundary.
  1374  	MOVL	(SI), SI
  1375  	JMP	si_finish
  1376  si_high:
  1377  	// low address byte is 0xfd-0xff. Load up to bytes we want, move to correct position.
  1378  	MOVL	-4(SI)(BX*1), SI	// 4 bytes ending at the last wanted byte
  1379  	SHRL	CX, SI	// wanted bytes are now the low count bytes
  1380  si_finish:
  1381  
  1382  	// same for DI.
  1383  	MOVL	DI, DX
  1384  	CMPB	DX, $0xfc
  1385  	JA	di_high
  1386  	MOVL	(DI), DI
  1387  	JMP	di_finish
  1388  di_high:
  1389  	MOVL	-4(DI)(BX*1), DI
  1390  	SHRL	CX, DI
  1391  di_finish:
  1392  
  1393  	SUBL	SI, DI	// low count bytes of the difference are zero iff a == b
  1394  	SHLL	CX, DI	// shift out the unwanted high bytes; ZF set iff equal
  1395  equal:
  1396  	SETEQ	(AX)
  1397  	RET
  1398  
  1399  TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
        	// Three-way comparison of two strings. Loads the registers cmpbody
        	// expects and tail-calls it; cmpbody writes the result word to ret.
  1400  	MOVL	s1_base+0(FP), SI	// SI = a
  1401  	MOVL	s1_len+4(FP), BX	// BX = alen
  1402  	MOVL	s2_base+8(FP), DI	// DI = b
  1403  	MOVL	s2_len+12(FP), DX	// DX = blen
  1404  	LEAL	ret+16(FP), AX	// AX = address of the result word
  1405  	JMP	runtime·cmpbody(SB)
  1406  
  1407  TEXT bytes·Compare(SB),NOSPLIT,$0-28
        	// Three-way comparison of two byte slices. Loads the registers
        	// cmpbody expects and tail-calls it; cmpbody writes the result
        	// word to ret.
  1408  	MOVL	s1+0(FP), SI	// SI = first slice's data pointer
  1409  	MOVL	s1+4(FP), BX	// BX = first slice's length
  1410  	MOVL	s2+12(FP), DI	// DI = second slice's data pointer
  1411  	MOVL	s2+16(FP), DX	// DX = second slice's length
  1412  	LEAL	ret+24(FP), AX	// AX = address of the result word
  1413  	JMP	runtime·cmpbody(SB)
  1414  
  1415  TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
        	// Returns the index of the first byte equal to c in the slice s,
        	// or -1 if there is none.
  1416  	MOVL	s+0(FP), SI
  1417  	MOVL	s_len+4(FP), CX
  1418  	MOVB	c+12(FP), AL
  1419  	MOVL	SI, DI	// DI is the scan pointer; SI keeps the start
  1420  	CLD; REPN; SCASB	// scan forward for AL, at most CX bytes
  1421  	JZ 3(PC)	// ZF set: a match was found, skip the not-found return
  1422  	MOVL	$-1, ret+16(FP)
  1423  	RET
  1424  	SUBL	SI, DI	// DI stopped one past the match
  1425  	SUBL	$1, DI	// index = DI - SI - 1 (also -1 if nothing was scanned)
  1426  	MOVL	DI, ret+16(FP)
  1427  	RET
  1428  
  1429  TEXT strings·IndexByte(SB),NOSPLIT,$0-16
        	// Returns the index of the first byte equal to c in the string s,
        	// or -1 if there is none.
  1430  	MOVL	s+0(FP), SI
  1431  	MOVL	s_len+4(FP), CX
  1432  	MOVB	c+8(FP), AL
  1433  	MOVL	SI, DI	// DI is the scan pointer; SI keeps the start
  1434  	CLD; REPN; SCASB	// scan forward for AL, at most CX bytes
  1435  	JZ 3(PC)	// ZF set: a match was found, skip the not-found return
  1436  	MOVL	$-1, ret+12(FP)
  1437  	RET
  1438  	SUBL	SI, DI	// DI stopped one past the match
  1439  	SUBL	$1, DI	// index = DI - SI - 1 (also -1 if nothing was scanned)
  1440  	MOVL	DI, ret+12(FP)
  1441  	RET
  1442  
  1443  // input:
  1444  //   SI = a
  1445  //   DI = b
  1446  //   BX = alen
  1447  //   DX = blen
  1448  //   AX = address of return word (set to 1/0/-1)
        // Lexicographic three-way comparison: the first differing byte within
        // the common prefix decides (bytes compared as unsigned); if the
        // common prefix is identical, the longer input is the greater one.
        // Uses BP, CX and X0-X1 as scratch and modifies SI, DI, BX and DX.
  1449  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1450  	MOVL	DX, BP
  1451  	SUBL	BX, DX // DX = blen-alen
  1452  	JLE	2(PC)	// blen <= alen: BP already holds the minimum
  1453  	MOVL	BX, BP // BP = min(alen, blen)
  1454  	CMPL	SI, DI
  1455  	JEQ	allsame	// same address: only the lengths can differ
  1456  	CMPL	BP, $4
  1457  	JB	small
  1458  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
  1459  	JE	mediumloop
  1460  largeloop:
  1461  	CMPL	BP, $16
  1462  	JB	mediumloop
  1463  	MOVOU	(SI), X0
  1464  	MOVOU	(DI), X1
  1465  	PCMPEQB X0, X1	// each byte of X1 is 0xff where a and b match
  1466  	PMOVMSKB X1, BX	// BX = one bit per byte
  1467  	XORL	$0xffff, BX	// convert EQ to NE
  1468  	JNE	diff16	// branch if at least one byte is not equal
  1469  	ADDL	$16, SI
  1470  	ADDL	$16, DI
  1471  	SUBL	$16, BP
  1472  	JMP	largeloop
  1473  
  1474  diff16:
  1475  	BSFL	BX, BX	// index of first byte that differs
  1476  	XORL	DX, DX	// length difference is no longer needed
  1477  	MOVB	(SI)(BX*1), CX
  1478  	CMPB	CX, (DI)(BX*1)
  1479  	SETHI	DX	// 1 if a's byte is larger (unsigned), else 0
  1480  	LEAL	-1(DX*2), DX	// convert 1/0 to +1/-1
  1481  	MOVL	DX, (AX)
  1482  	RET
  1483  
  1484  mediumloop:
  1485  	CMPL	BP, $4
  1486  	JBE	_0through4	// 4 or fewer common bytes remain
  1487  	MOVL	(SI), BX
  1488  	MOVL	(DI), CX
  1489  	CMPL	BX, CX
  1490  	JNE	diff4
  1491  	ADDL	$4, SI
  1492  	ADDL	$4, DI
  1493  	SUBL	$4, BP
  1494  	JMP	mediumloop
  1495  
        	// Compare the 4 bytes that end at the end of the common prefix.
        	// This may re-read bytes that already compared equal.
  1496  _0through4:
  1497  	MOVL	-4(SI)(BP*1), BX
  1498  	MOVL	-4(DI)(BP*1), CX
  1499  	CMPL	BX, CX
  1500  	JEQ	allsame
  1501  
        	// BX and CX hold differing 4-byte words from a and b.
  1502  diff4:
  1503  	BSWAPL	BX	// reverse order of bytes
  1504  	BSWAPL	CX
  1505  	XORL	BX, CX	// find bit differences
  1506  	BSRL	CX, CX	// index of highest bit difference
  1507  	SHRL	CX, BX	// move a's bit to bottom
  1508  	ANDL	$1, BX	// mask bit
  1509  	LEAL	-1(BX*2), BX // 1/0 => +1/-1
  1510  	MOVL	BX, (AX)
  1511  	RET
  1512  
  1513  	// 0-3 bytes in common
  1514  small:
  1515  	LEAL	(BP*8), CX
  1516  	NEGL	CX	// CX = -8*BP; shifts use the low 5 bits, i.e. 32-8*BP
  1517  	JEQ	allsame	// ZF from NEGL: no common bytes at all
  1518  
  1519  	// load si
  1520  	CMPB	SI, $0xfc	// compare the low byte of the address
  1521  	JA	si_high
  1522  	MOVL	(SI), SI
  1523  	JMP	si_finish
  1524  si_high:
  1525  	MOVL	-4(SI)(BP*1), SI	// 4 bytes ending at the last common byte
  1526  	SHRL	CX, SI	// wanted bytes are now the low BP bytes
  1527  si_finish:
  1528  	SHLL	CX, SI	// move the wanted bytes to the top, dropping the rest
  1529  
  1530  	// same for di
  1531  	CMPB	DI, $0xfc
  1532  	JA	di_high
  1533  	MOVL	(DI), DI
  1534  	JMP	di_finish
  1535  di_high:
  1536  	MOVL	-4(DI)(BP*1), DI
  1537  	SHRL	CX, DI
  1538  di_finish:
  1539  	SHLL	CX, DI
  1540  
  1541  	BSWAPL	SI	// reverse order of bytes
  1542  	BSWAPL	DI
  1543  	XORL	SI, DI	// find bit differences
  1544  	JEQ	allsame
  1545  	BSRL	DI, CX	// index of highest bit difference
  1546  	SHRL	CX, SI	// move a's bit to bottom
  1547  	ANDL	$1, SI	// mask bit
  1548  	LEAL	-1(SI*2), BX // 1/0 => +1/-1
  1549  	MOVL	BX, (AX)
  1550  	RET
  1551  
  1552  	// all the bytes in common are the same, so we just need
  1553  	// to compare the lengths.
  1554  allsame:
  1555  	XORL	BX, BX
  1556  	XORL	CX, CX
  1557  	TESTL	DX, DX	// DX = blen-alen
  1558  	SETLT	BX	// 1 if alen > blen
  1559  	SETEQ	CX	// 1 if alen == blen
  1560  	LEAL	-1(CX)(BX*2), BX	// 1,0,-1 result
  1561  	MOVL	BX, (AX)
  1562  	RET
  1563  
  1564  TEXT runtime·return0(SB), NOSPLIT, $0
        	// Sets AX to 0 and returns; touches nothing else.
  1565  	MOVL	$0, AX
  1566  	RET
  1567  
  1568  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  1569  // Must obey the gcc calling convention.
        // The result is returned in AX; CX is the only other register written.
  1570  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  1571  	get_tls(CX)
  1572  	MOVL	g(CX), AX	// AX = g
  1573  	MOVL	g_m(AX), AX	// AX = g->m
  1574  	MOVL	m_curg(AX), AX	// AX = g->m->curg
  1575  	MOVL	(g_stack+stack_hi)(AX), AX	// AX = curg.stack.hi
  1576  	RET
  1577  
  1578  // The top-most function running on a goroutine
  1579  // returns to goexit+PCQuantum.
        // NOTE(review): the leading one-byte NOP presumably makes that return
        // address land on the CALL below -- confirm PCQuantum for 386.
  1580  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  1581  	BYTE	$0x90	// NOP
  1582  	CALL	runtime·goexit1(SB)	// does not return
  1583  	// traceback from goexit1 must hit code range of goexit
  1584  	BYTE	$0x90	// NOP
  1585  
  1586  // Prefetching doesn't seem to help.
        // The four prefetch entry points below are therefore empty stubs:
        // each takes a 4-byte argument, ignores it, and returns immediately.
  1587  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4
  1588  	RET
  1589  
  1590  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4
  1591  	RET
  1592  
  1593  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4
  1594  	RET
  1595  
  1596  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4
  1597  	RET
  1598  
  1599  // Add a module's moduledata to the linked list of moduledata objects. This
  1600  // is called from .init_array by a function generated in the linker and so
  1601  // follows the platform ABI wrt register preservation -- it only touches AX,
  1602  // CX (implicitly) and DX, but it does not follow the ABI wrt arguments:
  1603  // instead the pointer to the moduledata is passed in AX.
        // The new moduledata is linked after the current tail
        // (runtime·lastmoduledatap) and then becomes the new tail.
  1604  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  1605         MOVL    runtime·lastmoduledatap(SB), DX	// DX = current tail of the list
  1606         MOVL    AX, moduledata_next(DX)	// tail.next = new moduledata
  1607         MOVL    AX, runtime·lastmoduledatap(SB)	// new moduledata becomes the tail
  1608         RET
  1609  
  1610  TEXT runtime·uint32tofloat64(SB),NOSPLIT,$8-12
        	// Converts the uint32 argument a to a float64 result. The value is
        	// widened to 64 bits on the stack with a zero high word, so the
        	// 64-bit integer load below sees it as non-negative.
  1611  	MOVL	a+0(FP), AX
  1612  	MOVL	AX, 0(SP)	// low 32 bits = a
  1613  	MOVL	$0, 4(SP)	// high 32 bits = 0
  1614  	FMOVV	0(SP), F0	// load the 8-byte integer onto the x87 stack
  1615  	FMOVDP	F0, ret+4(FP)	// store it as a float64 and pop
  1616  	RET
  1617  
  1618  TEXT runtime·float64touint32(SB),NOSPLIT,$12-12
        	// Converts the float64 argument a to a uint32 result. The value is
        	// stored as a 64-bit integer and only its low 32 bits are returned.
        	// Frame layout: 0(SP) saved x87 control word, 4(SP) 8-byte integer.
        	// NOTE(review): controlWord64trunc presumably selects truncation
        	// toward zero -- confirm against its definition.
  1619  	FMOVD	a+0(FP), F0
  1620  	FSTCW	0(SP)	// save the caller's x87 control word
  1621  	FLDCW	runtime·controlWord64trunc(SB)
  1622  	FMOVVP	F0, 4(SP)	// store as 8-byte integer and pop
  1623  	FLDCW	0(SP)	// restore the saved control word
  1624  	MOVL	4(SP), AX	// low 32 bits of the integer result
  1625  	MOVL	AX, ret+8(FP)
  1626  	RET