github.com/eun/go@v0.0.0-20170811110501-92cfd07a6cfd/src/runtime/asm_386.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
        	// Bootstrap routine for 386. In order, it:
        	//   1. saves argc/argv in a 16-byte aligned scratch area,
        	//   2. gives g0 provisional stack bounds,
        	//   3. probes CPUID and fills in the runtime·support_* flags,
        	//   4. sets up TLS, through _cgo_init when that symbol is non-zero
        	//      and through runtime·ldt0setup otherwise,
        	//   5. links g0 and m0 together,
        	//   6. calls check, args, osinit, schedinit, newproc(mainPC), mstart.
        	// The final INT $3 is reached only if mstart returns.
        	//
    11  	// copy arguments forward on an even stack
    12  	MOVL	argc+0(FP), AX
    13  	MOVL	argv+4(FP), BX
    14  	SUBL	$128, SP		// plenty of scratch
    15  	ANDL	$~15, SP
    16  	MOVL	AX, 120(SP)		// save argc, argv away
    17  	MOVL	BX, 124(SP)
    18  
    19  	// set default stack bounds.
    20  	// _cgo_init may update stackguard.
    21  	MOVL	$runtime·g0(SB), BP
    22  	LEAL	(-64*1024+104)(SP), BX	// BX = SP - 64KB + 104: used for stack.lo and both guards
    23  	MOVL	BX, g_stackguard0(BP)
    24  	MOVL	BX, g_stackguard1(BP)
    25  	MOVL	BX, (g_stack+stack_lo)(BP)
    26  	MOVL	SP, (g_stack+stack_hi)(BP)
    27  	
    28  	// find out information about the processor we're on
    29  #ifdef GOOS_nacl // NaCl doesn't like PUSHFL/POPFL
    30  	JMP 	has_cpuid
    31  #else
    32  	// first see if CPUID instruction is supported.
        	// Two copies of EFLAGS are pushed: the upper one is kept as the
        	// original, the lower one has bit 21 (ID) flipped and is loaded back.
        	// If the flip sticks, the XOR of new and original has bit 21 set.
    33  	PUSHFL
    34  	PUSHFL
    35  	XORL	$(1<<21), 0(SP) // flip ID bit
    36  	POPFL
    37  	PUSHFL
    38  	POPL	AX
    39  	XORL	0(SP), AX
    40  	POPFL	// restore EFLAGS
    41  	TESTL	$(1<<21), AX
    42  	JNE 	has_cpuid
    43  #endif
    44  
        	// bad_proc is reached by falling through when the ID bit could not be
        	// toggled, and by the JZ below when CPUID reports no MMX. It writes
        	// bad_proc_msg (0x3d bytes) to descriptor 2 and calls exit(1).
    45  bad_proc: // show that the program requires MMX.
    46  	MOVL	$2, 0(SP)
    47  	MOVL	$bad_proc_msg<>(SB), 4(SP)
    48  	MOVL	$0x3d, 8(SP)
    49  	CALL	runtime·write(SB)
    50  	MOVL	$1, 0(SP)
    51  	CALL	runtime·exit(SB)
    52  	INT	$3
    53  
    54  has_cpuid:
    55  	MOVL	$0, AX
    56  	CPUID
    57  	MOVL	AX, SI	// keep the leaf-0 AX result; compared with 7 at eax7 below
    58  	CMPL	AX, $0
    59  	JE	nocpuinfo
    60  
    61  	// Figure out how to serialize RDTSC.
    62  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    63  	// Don't know about the rest, so let's do MFENCE.
    64  	CMPL	BX, $0x756E6547  // "Genu"
    65  	JNE	notintel
    66  	CMPL	DX, $0x49656E69  // "ineI"
    67  	JNE	notintel
    68  	CMPL	CX, $0x6C65746E  // "ntel"
    69  	JNE	notintel
    70  	MOVB	$1, runtime·isIntel(SB)
    71  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
    72  notintel:
    73  
    74  	// Load EAX=1 cpuid flags
    75  	MOVL	$1, AX
    76  	CPUID
    77  	MOVL	CX, DI // Move to global variable clobbers CX when generating PIC
    78  	MOVL	AX, runtime·processorVersionInfo(SB)
    79  
        	// From here on the leaf-1 feature words are tested from DX and DI
        	// (DI is the copy of CX taken above).
    80  	// Check for MMX support
    81  	TESTL	$(1<<23), DX // MMX
    82  	JZ	bad_proc
    83  
    84  	TESTL	$(1<<26), DX // SSE2
    85  	SETNE	runtime·support_sse2(SB)
    86  
    87  	TESTL	$(1<<9), DI // SSSE3
    88  	SETNE	runtime·support_ssse3(SB)
    89  
    90  	TESTL	$(1<<19), DI // SSE4.1
    91  	SETNE	runtime·support_sse41(SB)
    92  
    93  	TESTL	$(1<<20), DI // SSE4.2
    94  	SETNE	runtime·support_sse42(SB)
    95  
    96  	TESTL	$(1<<23), DI // POPCNT
    97  	SETNE	runtime·support_popcnt(SB)
    98  
    99  	TESTL	$(1<<25), DI // AES
   100  	SETNE	runtime·support_aes(SB)
   101  
   102  	TESTL	$(1<<27), DI // OSXSAVE
   103  	SETNE	runtime·support_osxsave(SB)
   104  
   105  	// If OS support for XMM and YMM is not present
   106  	// support_avx will be set back to false later.
   107  	TESTL	$(1<<28), DI // AVX
   108  	SETNE	runtime·support_avx(SB)
   109  
   110  eax7:
   111  	// Load EAX=7/ECX=0 cpuid flags
        	// Skipped when the value saved in SI (AX from leaf 0) is below 7.
   112  	CMPL	SI, $7
   113  	JLT	osavx
   114  	MOVL	$7, AX
   115  	MOVL	$0, CX
   116  	CPUID
   117  
   118  	TESTL	$(1<<3), BX // BMI1
   119  	SETNE	runtime·support_bmi1(SB)
   120  
   121  	// If OS support for XMM and YMM is not present
   122  	// support_avx2 will be set back to false later.
   123  	TESTL	$(1<<5), BX // AVX2
   124  	SETNE	runtime·support_avx2(SB)
   125  
   126  	TESTL	$(1<<8), BX // BMI2
   127  	SETNE	runtime·support_bmi2(SB)
   128  
   129  	TESTL	$(1<<9), BX // ERMS
   130  	SETNE	runtime·support_erms(SB)
   131  
   132  osavx:
   133  	// nacl does not support XGETBV to test
   134  	// for XMM and YMM OS support.
        	// On nacl the block below is compiled out, so control always falls
        	// into noavx and both AVX flags are cleared.
   135  #ifndef GOOS_nacl
   136  	CMPB	runtime·support_osxsave(SB), $1
   137  	JNE	noavx
   138  	MOVL	$0, CX
   139  	// For XGETBV, OSXSAVE bit is required and sufficient
   140  	XGETBV
   141  	ANDL	$6, AX
   142  	CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
   143  	JE nocpuinfo
   144  #endif
   145  noavx:
   146  	MOVB $0, runtime·support_avx(SB)
   147  	MOVB $0, runtime·support_avx2(SB)
   148  
   149  nocpuinfo:
   150  	// if there is an _cgo_init, call it to let it
   151  	// initialize and to set up GS.  if not,
   152  	// we set up GS ourselves.
        	// _cgo_init is called with two stack arguments: BP (the address of
        	// g0, loaded near the top) at 0(SP) and setg_gcc at 4(SP).
   153  	MOVL	_cgo_init(SB), AX
   154  	TESTL	AX, AX
   155  	JZ	needtls
   156  	MOVL	$setg_gcc<>(SB), BX
   157  	MOVL	BX, 4(SP)
   158  	MOVL	BP, 0(SP)
   159  	CALL	AX
   160  
   161  	// update stackguard after _cgo_init
        	// Both guards become g0.stack.lo + _StackGuard.
   162  	MOVL	$runtime·g0(SB), CX
   163  	MOVL	(g_stack+stack_lo)(CX), AX
   164  	ADDL	$const__StackGuard, AX
   165  	MOVL	AX, g_stackguard0(CX)
   166  	MOVL	AX, g_stackguard1(CX)
   167  
   168  #ifndef GOOS_windows
   169  	// skip runtime·ldt0setup(SB) and tls test after _cgo_init for non-windows
   170  	JMP ok
   171  #endif
   172  needtls:
   173  #ifdef GOOS_plan9
   174  	// skip runtime·ldt0setup(SB) and tls test on Plan 9 in all cases
   175  	JMP	ok
   176  #endif
   177  
   178  	// set up %gs
   179  	CALL	runtime·ldt0setup(SB)
   180  
   181  	// store through it, to make sure it works
        	// 0x123 is written to the g slot through TLS and read back directly
        	// from runtime·m0+m_tls; a mismatch means TLS does not point there.
   182  	get_tls(BX)
   183  	MOVL	$0x123, g(BX)
   184  	MOVL	runtime·m0+m_tls(SB), AX
   185  	CMPL	AX, $0x123
   186  	JEQ	ok
   187  	MOVL	AX, 0	// abort
   188  ok:
   189  	// set up m and g "registers"
   190  	get_tls(BX)
   191  	LEAL	runtime·g0(SB), DX
   192  	MOVL	DX, g(BX)
   193  	LEAL	runtime·m0(SB), AX
   194  
   195  	// save m->g0 = g0
   196  	MOVL	DX, m_g0(AX)
   197  	// save g0->m = m0
   198  	MOVL	AX, g_m(DX)
   199  
   200  	CALL	runtime·emptyfunc(SB)	// fault if stack check is wrong
   201  
   202  	// convention is D is always cleared
   203  	CLD
   204  
   205  	CALL	runtime·check(SB)
   206  
   207  	// saved argc, argv
        	// Copy them from the scratch slots to the argument slots for args.
   208  	MOVL	120(SP), AX
   209  	MOVL	AX, 0(SP)
   210  	MOVL	124(SP), AX
   211  	MOVL	AX, 4(SP)
   212  	CALL	runtime·args(SB)
   213  	CALL	runtime·osinit(SB)
   214  	CALL	runtime·schedinit(SB)
   215  
   216  	// create a new goroutine to start program
        	// Two words are pushed for newproc and popped again after the call.
   217  	PUSHL	$runtime·mainPC(SB)	// entry
   218  	PUSHL	$0	// arg size
   219  	CALL	runtime·newproc(SB)
   220  	POPL	AX
   221  	POPL	AX
   222  
   223  	// start this M
   224  	CALL	runtime·mstart(SB)
   225  
   226  	INT $3
   227  	RET
   228  
   229  DATA	bad_proc_msg<>+0x00(SB)/8, $"This pro"	// message text, laid down 8 bytes at a time
   230  DATA	bad_proc_msg<>+0x08(SB)/8, $"gram can"
   231  DATA	bad_proc_msg<>+0x10(SB)/8, $" only be"
   232  DATA	bad_proc_msg<>+0x18(SB)/8, $" run on "
   233  DATA	bad_proc_msg<>+0x20(SB)/8, $"processo"
   234  DATA	bad_proc_msg<>+0x28(SB)/8, $"rs with "
   235  DATA	bad_proc_msg<>+0x30(SB)/8, $"MMX supp"
   236  DATA	bad_proc_msg<>+0x38(SB)/4, $"ort."
   237  DATA	bad_proc_msg<>+0x3c(SB)/1, $0xa	// trailing newline byte
   238  GLOBL	bad_proc_msg<>(SB), RODATA, $0x3d	// 0x3d bytes in total, the same length rt0_go passes to runtime·write
   239  
   240  DATA	runtime·mainPC+0(SB)/4,$runtime·main(SB)	// one word holding the address of runtime·main
   241  GLOBL	runtime·mainPC(SB),RODATA,$4	// read-only; rt0_go pushes the address of this word for newproc
   242  
   243  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
        	// Raises the x86 breakpoint trap (INT $3); takes no arguments and
        	// returns normally if execution is resumed after the trap.
   244  	INT $3
   245  	RET
   246  
   247  TEXT runtime·asminit(SB),NOSPLIT,$0-0
        	// Loads the x87 FPU control word from runtime·controlWord64.
        	// No arguments, no result.
   248  	// Linux and MinGW start the FPU in extended double precision.
   249  	// Other operating systems use double precision.
   250  	// Change to double precision to match them,
   251  	// and to match other hardware that only has double.
   252  	FLDCW	runtime·controlWord64(SB)
   253  	RET
   254  
   255  /*
   256   *  go-routine
   257   */
   258  
   259  // void gosave(Gobuf*)
   260  // save state in Gobuf; setjmp
        // Stores into buf: sp = address of the caller's argument slot, pc = the
        // return address found at 0(SP), ret = 0, and g = the current g read
        // from TLS. buf.ctxt is not written; if it is non-zero on entry,
        // runtime·badctxt is called.
   261  TEXT runtime·gosave(SB), NOSPLIT, $0-4
   262  	MOVL	buf+0(FP), AX		// gobuf
   263  	LEAL	buf+0(FP), BX		// caller's SP
   264  	MOVL	BX, gobuf_sp(AX)
   265  	MOVL	0(SP), BX		// caller's PC
   266  	MOVL	BX, gobuf_pc(AX)
   267  	MOVL	$0, gobuf_ret(AX)
   268  	// Assert ctxt is zero. See func save.
   269  	MOVL	gobuf_ctxt(AX), BX
   270  	TESTL	BX, BX
   271  	JZ	2(PC)	// ctxt is zero: step over the badctxt call
   272  	CALL	runtime·badctxt(SB)
   273  	get_tls(CX)
   274  	MOVL	g(CX), BX
   275  	MOVL	BX, gobuf_g(AX)
   276  	RET
   277  
   278  // void gogo(Gobuf*)
   279  // restore state from Gobuf; longjmp
        // Makes buf.g the current g, loads SP from buf.sp, AX from buf.ret and
        // DX from buf.ctxt, zeroes those three fields in buf, and jumps to
        // buf.pc. It does not return to its caller.
        // The 8-byte frame holds the two argument words for
        // runtime·writebarrierptr_prewrite.
   280  TEXT runtime·gogo(SB), NOSPLIT, $8-4
   281  	MOVL	buf+0(FP), BX		// gobuf
   282  
   283  	// If ctxt is not nil, invoke deletion barrier before overwriting.
   284  	MOVL	gobuf_ctxt(BX), DX
   285  	TESTL	DX, DX
   286  	JZ	nilctxt
   287  	LEAL	gobuf_ctxt(BX), AX
   288  	MOVL	AX, 0(SP)	// arg 0: address of the ctxt field
   289  	MOVL	$0, 4(SP)	// arg 1: zero, the value written further down
   290  	CALL	runtime·writebarrierptr_prewrite(SB)
   291  	MOVL	buf+0(FP), BX	// reload gobuf after the call
   292  
   293  nilctxt:
   294  	MOVL	gobuf_g(BX), DX
   295  	MOVL	0(DX), CX		// make sure g != nil
   296  	get_tls(CX)
   297  	MOVL	DX, g(CX)
   298  	MOVL	gobuf_sp(BX), SP	// restore SP
   299  	MOVL	gobuf_ret(BX), AX
   300  	MOVL	gobuf_ctxt(BX), DX
   301  	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
   302  	MOVL	$0, gobuf_ret(BX)
   303  	MOVL	$0, gobuf_ctxt(BX)
   304  	MOVL	gobuf_pc(BX), BX
   305  	JMP	BX
   306  
   307  // func mcall(fn func(*g))
   308  // Switch to m->g0's stack, call fn(g).
   309  // Fn must never return. It should gogo(&g->sched)
   310  // to keep running g.
        // The caller's PC and SP and the current g are saved in g->sched before
        // the switch. If the current g already is m->g0, control jumps to
        // runtime·badmcall; if fn returns, control jumps to runtime·badmcall2.
   311  TEXT runtime·mcall(SB), NOSPLIT, $0-4
   312  	MOVL	fn+0(FP), DI
   313  
   314  	get_tls(DX)
   315  	MOVL	g(DX), AX	// save state in g->sched
   316  	MOVL	0(SP), BX	// caller's PC
   317  	MOVL	BX, (g_sched+gobuf_pc)(AX)
   318  	LEAL	fn+0(FP), BX	// caller's SP
   319  	MOVL	BX, (g_sched+gobuf_sp)(AX)
   320  	MOVL	AX, (g_sched+gobuf_g)(AX)
   321  
   322  	// switch to m->g0 & its stack, call fn
   323  	MOVL	g(DX), BX
   324  	MOVL	g_m(BX), BX
   325  	MOVL	m_g0(BX), SI
   326  	CMPL	SI, AX	// if g == m->g0 call badmcall
   327  	JNE	3(PC)
   328  	MOVL	$runtime·badmcall(SB), AX
   329  	JMP	AX
   330  	MOVL	SI, g(DX)	// g = m->g0
   331  	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   332  	PUSHL	AX	// argument for fn: the g that was running on entry
   333  	MOVL	DI, DX	// DX = fn, the func value itself
   334  	MOVL	0(DI), DI	// DI = code address stored in the first word of fn
   335  	CALL	DI
   336  	POPL	AX
   337  	MOVL	$runtime·badmcall2(SB), AX
   338  	JMP	AX
   339  	RET
   340  
   341  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   342  // of the G stack. We need to distinguish the routine that
   343  // lives at the bottom of the G stack from the one that lives
   344  // at the top of the system stack because the one at the top of
   345  // the system stack terminates the stack walk (see topofstack()).
        // Its body is a single RET; systemstack only stores its address into
        // g->sched.pc.
   346  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   347  	RET
   348  
   349  // func systemstack(fn func())
        // Calls fn on the m->g0 stack. When the current g is already m->gsignal
        // or m->g0, fn is called in place (noswitch). When the current g is
        // m->curg, its state is saved in g->sched, execution moves to the g0
        // stack for the call, and then moves back to m->curg. Any other g leads
        // to a call of runtime·badsystemstack.
   350  TEXT runtime·systemstack(SB), NOSPLIT, $0-4
   351  	MOVL	fn+0(FP), DI	// DI = fn
   352  	get_tls(CX)
   353  	MOVL	g(CX), AX	// AX = g
   354  	MOVL	g_m(AX), BX	// BX = m
   355  
   356  	MOVL	m_gsignal(BX), DX	// DX = gsignal
   357  	CMPL	AX, DX
   358  	JEQ	noswitch
   359  
   360  	MOVL	m_g0(BX), DX	// DX = g0
   361  	CMPL	AX, DX
   362  	JEQ	noswitch
   363  
   364  	MOVL	m_curg(BX), BP
   365  	CMPL	AX, BP
   366  	JEQ	switch
   367  	
   368  	// Bad: g is not gsignal, not g0, not curg. What is it?
   369  	// Hide call from linker nosplit analysis.
   370  	MOVL	$runtime·badsystemstack(SB), AX
   371  	CALL	AX
   372  
   373  switch:
   374  	// save our state in g->sched. Pretend to
   375  	// be systemstack_switch if the G stack is scanned.
   376  	MOVL	$runtime·systemstack_switch(SB), (g_sched+gobuf_pc)(AX)
   377  	MOVL	SP, (g_sched+gobuf_sp)(AX)
   378  	MOVL	AX, (g_sched+gobuf_g)(AX)
   379  
   380  	// switch to g0
        	// DX still holds g0 from the comparison above.
   381  	get_tls(CX)
   382  	MOVL	DX, g(CX)
   383  	MOVL	(g_sched+gobuf_sp)(DX), BX
   384  	// make it look like mstart called systemstack on g0, to stop traceback
   385  	SUBL	$4, BX
   386  	MOVL	$runtime·mstart(SB), DX
   387  	MOVL	DX, 0(BX)
   388  	MOVL	BX, SP
   389  
   390  	// call target function
        	// DX = the func value, DI = the code address in its first word.
   391  	MOVL	DI, DX
   392  	MOVL	0(DI), DI
   393  	CALL	DI
   394  
   395  	// switch back to g
        	// g is taken from m->curg, SP from the value saved in g->sched.sp,
        	// and that saved value is then cleared.
   396  	get_tls(CX)
   397  	MOVL	g(CX), AX
   398  	MOVL	g_m(AX), BX
   399  	MOVL	m_curg(BX), AX
   400  	MOVL	AX, g(CX)
   401  	MOVL	(g_sched+gobuf_sp)(AX), SP
   402  	MOVL	$0, (g_sched+gobuf_sp)(AX)
   403  	RET
   404  
   405  noswitch:
   406  	// already on system stack, just call directly
   407  	MOVL	DI, DX
   408  	MOVL	0(DI), DI
   409  	CALL	DI
   410  	RET
   411  
   412  /*
   413   * support for morestack
   414   */
   415  
   416  // Called during function prolog when more stack is needed.
   417  //
   418  // The traceback routines see morestack on a g0 as being
   419  // the top of a stack (for example, morestack calling newstack
   420  // calling the scheduler calling newm calling gc), so we must
   421  // record an argument size. For that purpose, it has no arguments.
        //
        // On entry 0(SP) is the return address into f (the function that needs
        // more stack) and 4(SP) is the return address into f's caller. DX is
        // passed through unchanged and pushed as the ctxt argument to newstack;
        // morestack_noctxt zeroes it before jumping here.
   422  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   423  	// Cannot grow scheduler stack (m->g0).
   424  	get_tls(CX)
   425  	MOVL	g(CX), BX
   426  	MOVL	g_m(BX), BX
   427  	MOVL	m_g0(BX), SI
   428  	CMPL	g(CX), SI
   429  	JNE	3(PC)
   430  	CALL	runtime·badmorestackg0(SB)
   431  	INT	$3
   432  
   433  	// Cannot grow signal stack.
   434  	MOVL	m_gsignal(BX), SI
   435  	CMPL	g(CX), SI
   436  	JNE	3(PC)
   437  	CALL	runtime·badmorestackgsignal(SB)
   438  	INT	$3
   439  
   440  	// Called from f.
   441  	// Set m->morebuf to f's caller.
   442  	MOVL	4(SP), DI	// f's caller's PC
   443  	MOVL	DI, (m_morebuf+gobuf_pc)(BX)
   444  	LEAL	8(SP), CX	// f's caller's SP
   445  	MOVL	CX, (m_morebuf+gobuf_sp)(BX)
   446  	get_tls(CX)	// CX was reused as scratch just above, so reload it
   447  	MOVL	g(CX), SI
   448  	MOVL	SI, (m_morebuf+gobuf_g)(BX)
   449  
   450  	// Set g->sched to context in f.
   451  	MOVL	0(SP), AX	// f's PC
   452  	MOVL	AX, (g_sched+gobuf_pc)(SI)
   453  	MOVL	SI, (g_sched+gobuf_g)(SI)
   454  	LEAL	4(SP), AX	// f's SP
   455  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   456  	// newstack will fill gobuf.ctxt.
   457  
   458  	// Call newstack on m->g0's stack.
   459  	MOVL	m_g0(BX), BP
   460  	MOVL	BP, g(CX)
   461  	MOVL	(g_sched+gobuf_sp)(BP), AX
   462  	MOVL	-4(AX), BX	// fault if CALL would, before smashing SP
   463  	MOVL	AX, SP
   464  	PUSHL	DX	// ctxt argument
   465  	CALL	runtime·newstack(SB)
   466  	MOVL	$0, 0x1003	// crash if newstack returns
   467  	POPL	DX	// keep balance check happy
   468  	RET
   469  
   470  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0-0
        	// Entry to morestack for callers that have no context value:
        	// clears DX (which morestack pushes as the ctxt argument to
        	// newstack) and tail-jumps to runtime·morestack.
   471  	MOVL	$0, DX
   472  	JMP runtime·morestack(SB)
   473  
   474  // reflectcall: call a function with the given argument list
   475  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   476  // we don't have variable-sized frames, so we use a small number
   477  // of constant-sized-frame functions to encode a few bits of size in the pc.
   478  // Caution: ugly multiline assembly macros in your future!
        //
        // DISPATCH(NAME, MAXSIZE) expects the argument size in CX. When CX is at
        // most MAXSIZE (unsigned compare) it jumps to NAME through AX; otherwise
        // the JA skips the MOVL/JMP pair and execution continues with whatever
        // follows the macro.
   479  
   480  #define DISPATCH(NAME,MAXSIZE)		\
   481  	CMPL	CX, $MAXSIZE;		\
   482  	JA	3(PC);			\
   483  	MOVL	$NAME(SB), AX;		\
   484  	JMP	AX
   485  // Note: can't just "JMP NAME(SB)" - bad inlining results.
   486  
   487  TEXT reflect·call(SB), NOSPLIT, $0-0
        	// Alias in package reflect: jumps straight to ·reflectcall without
        	// touching the stack, so the callee sees the same arguments.
   488  	JMP	·reflectcall(SB)
   489  
   490  TEXT ·reflectcall(SB), NOSPLIT, $0-20
        	// Loads argsize into CX and jumps to the first runtime·callN whose
        	// N is at least argsize. The sizes double from 16 up to 1073741824
        	// (1<<30). If argsize is larger than that, control jumps to
        	// runtime·badreflectcall. The jump leaves the stack untouched, so
        	// the chosen callN sees the same five arguments.
   491  	MOVL	argsize+12(FP), CX
   492  	DISPATCH(runtime·call16, 16)
   493  	DISPATCH(runtime·call32, 32)
   494  	DISPATCH(runtime·call64, 64)
   495  	DISPATCH(runtime·call128, 128)
   496  	DISPATCH(runtime·call256, 256)
   497  	DISPATCH(runtime·call512, 512)
   498  	DISPATCH(runtime·call1024, 1024)
   499  	DISPATCH(runtime·call2048, 2048)
   500  	DISPATCH(runtime·call4096, 4096)
   501  	DISPATCH(runtime·call8192, 8192)
   502  	DISPATCH(runtime·call16384, 16384)
   503  	DISPATCH(runtime·call32768, 32768)
   504  	DISPATCH(runtime·call65536, 65536)
   505  	DISPATCH(runtime·call131072, 131072)
   506  	DISPATCH(runtime·call262144, 262144)
   507  	DISPATCH(runtime·call524288, 524288)
   508  	DISPATCH(runtime·call1048576, 1048576)
   509  	DISPATCH(runtime·call2097152, 2097152)
   510  	DISPATCH(runtime·call4194304, 4194304)
   511  	DISPATCH(runtime·call8388608, 8388608)
   512  	DISPATCH(runtime·call16777216, 16777216)
   513  	DISPATCH(runtime·call33554432, 33554432)
   514  	DISPATCH(runtime·call67108864, 67108864)
   515  	DISPATCH(runtime·call134217728, 134217728)
   516  	DISPATCH(runtime·call268435456, 268435456)
   517  	DISPATCH(runtime·call536870912, 536870912)
   518  	DISPATCH(runtime·call1073741824, 1073741824)
   519  	MOVL	$runtime·badreflectcall(SB), AX
   520  	JMP	AX
   521  
        // CALLFN(NAME, MAXSIZE) defines a WRAPPER function NAME with a
        // MAXSIZE-byte frame and the 20 bytes of arguments documented for
        // reflectcall (argtype, f, argptr, argsize, retoffset). The function:
        //   1. copies argsize bytes from argptr to the bottom of its own frame
        //      (REP MOVSB with SI = argptr, DI = SP, CX = argsize),
        //   2. calls the code address stored in the first word of f, with DX
        //      holding f,
        //   3. hands callRet<> the registers DX = argtype, DI = argptr +
        //      retoffset, SI = SP + retoffset, CX = argsize - retoffset.
   522  #define CALLFN(NAME,MAXSIZE)			\
   523  TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
   524  	NO_LOCAL_POINTERS;			\
   525  	/* copy arguments to stack */		\
   526  	MOVL	argptr+8(FP), SI;		\
   527  	MOVL	argsize+12(FP), CX;		\
   528  	MOVL	SP, DI;				\
   529  	REP;MOVSB;				\
   530  	/* call function */			\
   531  	MOVL	f+4(FP), DX;			\
   532  	MOVL	(DX), AX; 			\
   533  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   534  	CALL	AX;				\
   535  	/* copy return values back */		\
   536  	MOVL	argtype+0(FP), DX;		\
   537  	MOVL	argptr+8(FP), DI;		\
   538  	MOVL	argsize+12(FP), CX;		\
   539  	MOVL	retoffset+16(FP), BX;		\
   540  	MOVL	SP, SI;				\
   541  	ADDL	BX, DI;				\
   542  	ADDL	BX, SI;				\
   543  	SUBL	BX, CX;				\
   544  	CALL	callRet<>(SB);			\
   545  	RET
   546  
   547  // callRet copies return values back at the end of call*. This is a
   548  // separate function so it can allocate stack space for the arguments
   549  // to reflectcallmove. It does not follow the Go ABI; it expects its
   550  // arguments in registers.
        // Register inputs, stored in this order as the four stack arguments of
        // runtime·reflectcallmove: DX, DI, SI, CX. The CALLFN macro loads them
        // with argtype, destination (argptr+retoffset), source (SP+retoffset)
        // and byte count (argsize-retoffset) respectively.
   551  TEXT callRet<>(SB), NOSPLIT, $16-0
   552  	MOVL	DX, 0(SP)
   553  	MOVL	DI, 4(SP)
   554  	MOVL	SI, 8(SP)
   555  	MOVL	CX, 12(SP)
   556  	CALL	runtime·reflectcallmove(SB)
   557  	RET
   558  
   559  CALLFN(·call16, 16)	// one function per frame size, doubling from 16 bytes to 1<<30
   560  CALLFN(·call32, 32)	// same size list as the DISPATCH chain in reflectcall
   561  CALLFN(·call64, 64)	// NOTE(review): DISPATCH spells these runtime·callN; presumably the same symbols - confirm
   562  CALLFN(·call128, 128)
   563  CALLFN(·call256, 256)
   564  CALLFN(·call512, 512)
   565  CALLFN(·call1024, 1024)
   566  CALLFN(·call2048, 2048)
   567  CALLFN(·call4096, 4096)
   568  CALLFN(·call8192, 8192)
   569  CALLFN(·call16384, 16384)
   570  CALLFN(·call32768, 32768)
   571  CALLFN(·call65536, 65536)
   572  CALLFN(·call131072, 131072)
   573  CALLFN(·call262144, 262144)
   574  CALLFN(·call524288, 524288)
   575  CALLFN(·call1048576, 1048576)
   576  CALLFN(·call2097152, 2097152)
   577  CALLFN(·call4194304, 4194304)
   578  CALLFN(·call8388608, 8388608)
   579  CALLFN(·call16777216, 16777216)
   580  CALLFN(·call33554432, 33554432)
   581  CALLFN(·call67108864, 67108864)
   582  CALLFN(·call134217728, 134217728)
   583  CALLFN(·call268435456, 268435456)
   584  CALLFN(·call536870912, 536870912)
   585  CALLFN(·call1073741824, 1073741824)
   586  
   587  TEXT runtime·procyield(SB),NOSPLIT,$0-0
        	// Executes PAUSE `cycles` times, counting AX down to zero.
        	// The count is decremented before it is tested, so a count of 0
        	// wraps around instead of returning immediately.
        	// NOTE(review): the TEXT line declares 0 bytes of arguments although
        	// cycles+0(FP) is read below - confirm against the Go declaration.
   588  	MOVL	cycles+0(FP), AX
   589  again:
   590  	PAUSE
   591  	SUBL	$1, AX
   592  	JNZ	again
   593  	RET
   594  
   595  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
        	// Emits no fence instruction; the body is a bare RET.
   596  	// Stores are already ordered on x86, so this is just a
   597  	// compile barrier.
   598  	RET
   599  
   600  // void jmpdefer(fn, sp);
   601  // called from deferreturn.
   602  // 1. pop the caller
   603  // 2. sub 5 bytes (the length of CALL & a 32 bit displacement) from the callers
   604  //    return (when building for shared libraries, subtract 16 bytes -- 5 bytes
   605  //    for CALL & displacement to call __x86.get_pc_thunk.cx, 6 bytes for the
   606  //    LEAL to load the offset into BX, and finally 5 for the call & displacement)
   607  // 3. jmp to the argument
        //
        // In terms of the code: SP is set to argp-4, which is where the return
        // address of the CALL to deferreturn lives. That return address is moved
        // back by 5 (or 16) bytes, so that when the deferred function returns it
        // lands on that CALL again. DX carries fn into the deferred function.
   608  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
   609  	MOVL	fv+0(FP), DX	// fn
   610  	MOVL	argp+4(FP), BX	// caller sp
   611  	LEAL	-4(BX), SP	// caller sp after CALL
   612  #ifdef GOBUILDMODE_shared
   613  	SUBL	$16, (SP)	// return to CALL again
   614  #else
   615  	SUBL	$5, (SP)	// return to CALL again
   616  #endif
   617  	MOVL	0(DX), BX	// code address stored in the first word of fn
   618  	JMP	BX	// but first run the deferred function
   619  
   620  // Save state of caller into g->sched.
        // File-local helper used by asmcgocall. Takes no arguments: it records
        // the caller's SP (the address just above this function's return
        // address) and the caller's PC (the return address itself) in the
        // current g's sched, zeroes sched.ret, and calls runtime·badctxt if
        // sched.ctxt is non-zero. AX and BX are saved and restored around the
        // body; no other general register is written.
   621  TEXT gosave<>(SB),NOSPLIT,$0
   622  	PUSHL	AX
   623  	PUSHL	BX
   624  	get_tls(BX)
   625  	MOVL	g(BX), BX
   626  	LEAL	arg+0(FP), AX
   627  	MOVL	AX, (g_sched+gobuf_sp)(BX)
   628  	MOVL	-4(AX), AX	// the word just below that address is the return PC
   629  	MOVL	AX, (g_sched+gobuf_pc)(BX)
   630  	MOVL	$0, (g_sched+gobuf_ret)(BX)
   631  	// Assert ctxt is zero. See func save.
   632  	MOVL	(g_sched+gobuf_ctxt)(BX), AX
   633  	TESTL	AX, AX
   634  	JZ	2(PC)
   635  	CALL	runtime·badctxt(SB)
   636  	POPL	BX
   637  	POPL	AX
   638  	RET
   639  
   640  // func asmcgocall(fn, arg unsafe.Pointer) int32
   641  // Call fn(arg) on the scheduler stack,
   642  // aligned appropriately for the gcc ABI.
   643  // See cgocall.go for more details.
        //
        // Registers held across the switch: AX = fn, BX = arg, DX = SP on entry,
        // DI = the g that was current on entry, SI = m->g0. The value fn leaves
        // in AX is stored as the int32 result.
   644  TEXT ·asmcgocall(SB),NOSPLIT,$0-12
   645  	MOVL	fn+0(FP), AX
   646  	MOVL	arg+4(FP), BX
   647  
   648  	MOVL	SP, DX
   649  
   650  	// Figure out if we need to switch to m->g0 stack.
   651  	// We get called to create new OS threads too, and those
   652  	// come in on the m->g0 stack already.
   653  	get_tls(CX)
   654  	MOVL	g(CX), BP
   655  	MOVL	g_m(BP), BP
   656  	MOVL	m_g0(BP), SI
   657  	MOVL	g(CX), DI
   658  	CMPL	SI, DI
   659  	JEQ	noswitch
   660  	CALL	gosave<>(SB)	// gosave<> restores AX and BX before returning
   661  	get_tls(CX)
   662  	MOVL	SI, g(CX)
   663  	MOVL	(g_sched+gobuf_sp)(SI), SP
   664  
   665  noswitch:
   666  	// Now on a scheduling stack (a pthread-created stack).
        	// Scratch layout: 0(SP) = arg for fn, 4(SP) = stack.hi - entry SP of
        	// the original g, 8(SP) = the original g.
   667  	SUBL	$32, SP
   668  	ANDL	$~15, SP	// alignment, perhaps unnecessary
   669  	MOVL	DI, 8(SP)	// save g
   670  	MOVL	(g_stack+stack_hi)(DI), DI
   671  	SUBL	DX, DI
   672  	MOVL	DI, 4(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   673  	MOVL	BX, 0(SP)	// first argument in x86-32 ABI
   674  	CALL	AX
   675  
   676  	// Restore registers, g, stack pointer.
        	// SP is recomputed as the current stack.hi minus the saved depth.
   677  	get_tls(CX)
   678  	MOVL	8(SP), DI
   679  	MOVL	(g_stack+stack_hi)(DI), SI
   680  	SUBL	4(SP), SI
   681  	MOVL	DI, g(CX)
   682  	MOVL	SI, SP
   683  
   684  	MOVL	AX, ret+8(FP)
   685  	RET
   686  
   687  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
   688  // Turn the fn into a Go func (by taking its address) and call
   689  // cgocallback_gofunc.
        // The first outgoing argument is the address of the fn argument slot
        // (LEAL, not MOVL); frame, framesize and ctxt are copied by value. The
        // call goes indirectly through AX.
   690  TEXT runtime·cgocallback(SB),NOSPLIT,$16-16
   691  	LEAL	fn+0(FP), AX
   692  	MOVL	AX, 0(SP)
   693  	MOVL	frame+4(FP), AX
   694  	MOVL	AX, 4(SP)
   695  	MOVL	framesize+8(FP), AX
   696  	MOVL	AX, 8(SP)
   697  	MOVL	ctxt+12(FP), AX
   698  	MOVL	AX, 12(SP)
   699  	MOVL	$runtime·cgocallback_gofunc(SB), AX
   700  	CALL	AX
   701  	RET
   702  
   703  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
   704  // See cgocall.go for more details.
        //
        // Outline: obtain an m (borrowing one through needm when TLS holds no
        // g), switch from m->g0 to m->curg, call runtime·cgocallbackg there,
        // switch back to m->g0, and call dropm if an m was borrowed.
        // DX carries the "old m" marker throughout: the m found on entry, or
        // the word read back from 0(SP) after needm, which is zeroed before
        // that call.
   705  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-16
   706  	NO_LOCAL_POINTERS
   707  
   708  	// If g is nil, Go did not create the current thread.
   709  	// Call needm to obtain one for temporary use.
   710  	// In this case, we're running on the thread stack, so there's
   711  	// lots of space, but the linker doesn't know. Hide the call from
   712  	// the linker analysis by using an indirect call through AX.
   713  	get_tls(CX)
   714  #ifdef GOOS_windows
        	// If the TLS base in CX is zero, skip the g(CX) load below and
        	// leave BP = 0, which takes the needm path.
   715  	MOVL	$0, BP
   716  	CMPL	CX, $0
   717  	JEQ	2(PC) // TODO
   718  #endif
   719  	MOVL	g(CX), BP
   720  	CMPL	BP, $0
   721  	JEQ	needm
   722  	MOVL	g_m(BP), BP
   723  	MOVL	BP, DX // saved copy of oldm
   724  	JMP	havem
   725  needm:
   726  	MOVL	$0, 0(SP)
   727  	MOVL	$runtime·needm(SB), AX
   728  	CALL	AX
   729  	MOVL	0(SP), DX	// reload the slot zeroed above; tested against 0 before dropm
   730  	get_tls(CX)
   731  	MOVL	g(CX), BP
   732  	MOVL	g_m(BP), BP
   733  
   734  	// Set m->sched.sp = SP, so that if a panic happens
   735  	// during the function we are about to execute, it will
   736  	// have a valid SP to run on the g0 stack.
   737  	// The next few lines (after the havem label)
   738  	// will save this SP onto the stack and then write
   739  	// the same SP back to m->sched.sp. That seems redundant,
   740  	// but if an unrecovered panic happens, unwindm will
   741  	// restore the g->sched.sp from the stack location
   742  	// and then systemstack will try to use it. If we don't set it here,
   743  	// that restored SP will be uninitialized (typically 0) and
   744  	// will not be usable.
   745  	MOVL	m_g0(BP), SI
   746  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   747  
   748  havem:
   749  	// Now there's a valid m, and we're running on its m->g0.
   750  	// Save current m->g0->sched.sp on stack and then set it to SP.
   751  	// Save current sp in m->g0->sched.sp in preparation for
   752  	// switch back to m->curg stack.
   753  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   754  	MOVL	m_g0(BP), SI
   755  	MOVL	(g_sched+gobuf_sp)(SI), AX
   756  	MOVL	AX, 0(SP)
   757  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   758  
   759  	// Switch to m->curg stack and call runtime.cgocallbackg.
   760  	// Because we are taking over the execution of m->curg
   761  	// but *not* resuming what had been running, we need to
   762  	// save that information (m->curg->sched) so we can restore it.
   763  	// We can restore m->curg->sched.sp easily, because calling
   764  	// runtime.cgocallbackg leaves SP unchanged upon return.
   765  	// To save m->curg->sched.pc, we push it onto the stack.
   766  	// This has the added benefit that it looks to the traceback
   767  	// routine like cgocallbackg is going to return to that
   768  	// PC (because the frame we allocate below has the same
   769  	// size as cgocallback_gofunc's frame declared above)
   770  	// so that the traceback will seamlessly trace back into
   771  	// the earlier calls.
   772  	//
   773  	// In the new goroutine, 4(SP) holds the saved oldm (DX) register.
   774  	// 8(SP) is unused.
        	// With DI = curg.sched.sp, the new SP is DI-16: 0(SP) = ctxt,
        	// 4(SP) = DX, 12(SP) = the saved sched.pc (the word at -4(DI)).
   775  	MOVL	m_curg(BP), SI
   776  	MOVL	SI, g(CX)
   777  	MOVL	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
   778  	MOVL	(g_sched+gobuf_pc)(SI), BP
   779  	MOVL	BP, -4(DI)
   780  	MOVL	ctxt+12(FP), CX
   781  	LEAL	-(4+12)(DI), SP
   782  	MOVL	DX, 4(SP)
   783  	MOVL	CX, 0(SP)
   784  	CALL	runtime·cgocallbackg(SB)
   785  	MOVL	4(SP), DX
   786  
   787  	// Restore g->sched (== m->curg->sched) from saved values.
   788  	get_tls(CX)
   789  	MOVL	g(CX), SI
   790  	MOVL	12(SP), BP	// the sched.pc stored at -4(DI) above
   791  	MOVL	BP, (g_sched+gobuf_pc)(SI)
   792  	LEAL	(12+4)(SP), DI	// SP+16 is the sched.sp value read above
   793  	MOVL	DI, (g_sched+gobuf_sp)(SI)
   794  
   795  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   796  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   797  	// so we do not have to restore it.)
   798  	MOVL	g(CX), BP
   799  	MOVL	g_m(BP), BP
   800  	MOVL	m_g0(BP), SI
   801  	MOVL	SI, g(CX)
   802  	MOVL	(g_sched+gobuf_sp)(SI), SP
   803  	MOVL	0(SP), AX
   804  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   805  	
   806  	// If the m on entry was nil, we called needm above to borrow an m
   807  	// for the duration of the call. Since the call is over, return it with dropm.
   808  	CMPL	DX, $0
   809  	JNE 3(PC)
   810  	MOVL	$runtime·dropm(SB), AX
   811  	CALL	AX
   812  
   813  	// Done!
   814  	RET
   815  
   816  // void setg(G*); set g. for use by needm.
        // Stores gg in the TLS g slot. On windows it first maintains the word
        // at 0x14(FS): zero when gg is nil (and it then returns without
        // touching the g slot), otherwise the address of gg->m->tls.
   817  TEXT runtime·setg(SB), NOSPLIT, $0-4
   818  	MOVL	gg+0(FP), BX
   819  #ifdef GOOS_windows
   820  	CMPL	BX, $0
   821  	JNE	settls
   822  	MOVL	$0, 0x14(FS)
   823  	RET
   824  settls:
   825  	MOVL	g_m(BX), AX
   826  	LEAL	m_tls(AX), AX
   827  	MOVL	AX, 0x14(FS)
   828  #endif
   829  	get_tls(CX)
   830  	MOVL	BX, g(CX)
   831  	RET
   832  
   833  // void setg_gcc(G*); set g. for use by gcc
        // Stores its single stack argument in the TLS g slot. rt0_go passes the
        // address of this function to _cgo_init.
   834  TEXT setg_gcc<>(SB), NOSPLIT, $0
   835  	get_tls(AX)
   836  	MOVL	gg+0(FP), DX
   837  	MOVL	DX, g(AX)
   838  	RET
   839  
   840  // check that SP is strictly between g->stack.lo and g->stack.hi
        // Both comparisons are unsigned and strict: INT $3 is executed unless
        // stack.hi > SP, and again unless SP > stack.lo. SP equal to either
        // bound therefore traps.
   841  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   842  	get_tls(CX)
   843  	MOVL	g(CX), AX
   844  	CMPL	(g_stack+stack_hi)(AX), SP
   845  	JHI	2(PC)	// stack.hi > SP: skip the trap
   846  	INT	$3
   847  	CMPL	SP, (g_stack+stack_lo)(AX)
   848  	JHI	2(PC)	// SP > stack.lo: skip the trap
   849  	INT	$3
   850  	RET
   851  
   852  TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8
        	// Takes argp, the address of some function's first argument, and
        	// returns the word stored 4 bytes below it, which is where the
        	// return address of the CALL into that function sits.
   853  	MOVL	argp+0(FP),AX		// addr of first arg
   854  	MOVL	-4(AX),AX		// get calling pc
   855  	MOVL	AX, ret+4(FP)
   856  	RET
   857  
   858  // func cputicks() int64
        // Returns the RDTSC counter: AX becomes the low word, DX the high word.
        // A fence is issued first when runtime·support_sse2 is 1: LFENCE if
        // runtime·lfenceBeforeRdtsc is 1, MFENCE otherwise. Without SSE2 no
        // fence is issued. The fences are emitted as raw bytes.
   859  TEXT runtime·cputicks(SB),NOSPLIT,$0-8
   860  	CMPB	runtime·support_sse2(SB), $1
   861  	JNE	done
   862  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   863  	JNE	mfence
   864  	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
   865  	JMP	done
   866  mfence:
   867  	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
   868  done:
   869  	RDTSC
   870  	MOVL	AX, ret_lo+0(FP)
   871  	MOVL	DX, ret_hi+4(FP)
   872  	RET
   873  
   874  TEXT runtime·ldt0setup(SB),NOSPLIT,$16-0
        	// Calls runtime·setldt(7, &runtime·m0.tls, 32). Called from rt0_go
        	// when TLS is not set up by _cgo_init.
   875  	// set up ldt 7 to point at m0.tls
   876  	// ldt 1 would be fine on Linux, but on OS X, 7 is as low as we can go.
   877  	// the entry number is just a hint.  setldt will set up GS with what it used.
   878  	MOVL	$7, 0(SP)
   879  	LEAL	runtime·m0+m_tls(SB), AX
   880  	MOVL	AX, 4(SP)
   881  	MOVL	$32, 8(SP)	// sizeof(tls array)
   882  	CALL	runtime·setldt(SB)
   883  	RET
   884  
   885  TEXT runtime·emptyfunc(SB),0,$0-0
        	// Does nothing. The flags field is 0 rather than NOSPLIT; rt0_go
        	// calls this with the note "fault if stack check is wrong", so
        	// presumably the assembler adds its stack-bound check here - confirm.
   886  	RET
   887  
   888  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   889  // redirects to memhash(p, h, size) using the size
   890  // stored in the closure.
        // DX is the closure pointer on entry; the size is the word at offset 4
        // from it. memhash's result is read from 12(SP), directly after its
        // three 4-byte arguments.
   891  TEXT runtime·memhash_varlen(SB),NOSPLIT,$16-12
   892  	GO_ARGS
   893  	NO_LOCAL_POINTERS
   894  	MOVL	p+0(FP), AX
   895  	MOVL	h+4(FP), BX
   896  	MOVL	4(DX), CX	// size stored in the closure
   897  	MOVL	AX, 0(SP)
   898  	MOVL	BX, 4(SP)
   899  	MOVL	CX, 8(SP)
   900  	CALL	runtime·memhash(SB)
   901  	MOVL	12(SP), AX
   902  	MOVL	AX, ret+8(FP)
   903  	RET
   904  
   905  // hash function using AES hardware instructions
        // Loads AX = data pointer, BX = size and DX = address of the result
        // slot, then tail-jumps to runtime·aeshashbody. The seed argument at
        // h+4(FP) is not loaded here; aeshashbody reads it through FP itself.
   906  TEXT runtime·aeshash(SB),NOSPLIT,$0-16
   907  	MOVL	p+0(FP), AX	// ptr to data
   908  	MOVL	s+8(FP), BX	// size
   909  	LEAL	ret+12(FP), DX
   910  	JMP	runtime·aeshashbody(SB)
   911  
   912  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
        	// String variant of aeshash: p points at a string header, whose
        	// first word is the data pointer and second word the length. Sets
        	// AX = data, BX = length, DX = address of the result slot and
        	// tail-jumps to runtime·aeshashbody.
   913  	MOVL	p+0(FP), AX	// ptr to string object
   914  	MOVL	4(AX), BX	// length of string
   915  	MOVL	(AX), AX	// string data
   916  	LEAL	ret+8(FP), DX
   917  	JMP	runtime·aeshashbody(SB)
   918  
   919  // aeshashbody is the shared body that aeshash and aeshashstr JMP to, with AX: data,
   920  // BX: length in bytes,
   921  // DX: address to put the 32-bit return value. The seed is read from h+4(FP).
   922  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   923  	MOVL	h+4(FP), X0	            // 32 bits of per-table hash seed
   924  	PINSRW	$4, BX, X0	            // 16 bits of length
   925  	PSHUFHW	$0, X0, X0	            // replace size with its low 2 bytes repeated 4 times
   926  	MOVO	X0, X1                      // save unscrambled seed
   927  	PXOR	runtime·aeskeysched(SB), X0 // xor in per-process seed
   928  	AESENC	X0, X0                      // scramble seed
   929  
   930  	CMPL	BX, $16	// dispatch on length (unsigned compares)
   931  	JB	aes0to15
   932  	JE	aes16
   933  	CMPL	BX, $32
   934  	JBE	aes17to32
   935  	CMPL	BX, $64
   936  	JBE	aes33to64
   937  	JMP	aes65plus
   938  	
   939  aes0to15:
   940  	TESTL	BX, BX
   941  	JE	aes0	// empty input has its own path
   942  
   943  	ADDL	$16, AX	// AX = data+16 from here on in this path
   944  	TESTW	$0xff0, AX	// ZF=1 iff bits 4-11 of data+16 are clear, i.e. data's low 12 bits are >= 0xff0
   945  	JE	endofpage
   946  
   947  	// 16 bytes loaded at this address won't cross
   948  	// a page boundary, so we can load it directly.
   949  	MOVOU	-16(AX), X1	// 16 bytes starting at data (overwrites the saved seed copy)
   950  	ADDL	BX, BX	// BX = 2*len, so BX*8 below is len*16: the 16-byte table entry for len
   951  	PAND	masks<>(SB)(BX*8), X1	// keep only the low len bytes
   952  
   953  final1:	
   954  	AESENC	X0, X1  // scramble input, xor in seed
   955  	AESENC	X1, X1  // scramble combo 2 times
   956  	AESENC	X1, X1
   957  	MOVL	X1, (DX)	// low 32 bits are the result
   958  	RET
   959  
   960  endofpage:
   961  	// low 12 bits of the address are 0xff0 or higher. Might be up against
   962  	// a page boundary, so load ending at last byte.
   963  	// Then shift bytes down using pshufb.
   964  	MOVOU	-32(AX)(BX*1), X1	// AX is data+16 here, so this reads the 16 bytes ending at data+len
   965  	ADDL	BX, BX	// BX = 2*len, so BX*8 below selects the 16-byte shifts entry for len
   966  	PSHUFB	shifts<>(SB)(BX*8), X1
   967  	JMP	final1
   968  
   969  aes0:
   970  	// Return scrambled input seed
   971  	AESENC	X0, X0
   972  	MOVL	X0, (DX)
   973  	RET
   974  
   975  aes16:
   976  	MOVOU	(AX), X1	// exactly one 16-byte block; no masking needed
   977  	JMP	final1
   978  
   979  aes17to32:
   980  	// make second starting seed
   981  	PXOR	runtime·aeskeysched+16(SB), X1
   982  	AESENC	X1, X1
   983  	
   984  	// load data to be hashed
   985  	MOVOU	(AX), X2	// first 16 bytes
   986  	MOVOU	-16(AX)(BX*1), X3	// last 16 bytes (overlaps the first block when len < 32)
   987  
   988  	// scramble 3 times
   989  	AESENC	X0, X2
   990  	AESENC	X1, X3
   991  	AESENC	X2, X2
   992  	AESENC	X3, X3
   993  	AESENC	X2, X2
   994  	AESENC	X3, X3
   995  
   996  	// combine results
   997  	PXOR	X3, X2
   998  	MOVL	X2, (DX)
   999  	RET
  1000  
  1001  aes33to64:
  1002  	// make 3 more starting seeds
  1003  	MOVO	X1, X2
  1004  	MOVO	X1, X3
  1005  	PXOR	runtime·aeskeysched+16(SB), X1
  1006  	PXOR	runtime·aeskeysched+32(SB), X2
  1007  	PXOR	runtime·aeskeysched+48(SB), X3
  1008  	AESENC	X1, X1
  1009  	AESENC	X2, X2
  1010  	AESENC	X3, X3
  1011  	
  1012  	MOVOU	(AX), X4	// first 32 bytes of the data ...
  1013  	MOVOU	16(AX), X5
  1014  	MOVOU	-32(AX)(BX*1), X6	// ... and the last 32 bytes (the two ranges overlap when len < 64)
  1015  	MOVOU	-16(AX)(BX*1), X7
  1016  	
  1017  	AESENC	X0, X4	// mix each data block with its own seed
  1018  	AESENC	X1, X5
  1019  	AESENC	X2, X6
  1020  	AESENC	X3, X7
  1021  	
  1022  	AESENC	X4, X4	// two further scrambling rounds per lane
  1023  	AESENC	X5, X5
  1024  	AESENC	X6, X6
  1025  	AESENC	X7, X7
  1026  	
  1027  	AESENC	X4, X4
  1028  	AESENC	X5, X5
  1029  	AESENC	X6, X6
  1030  	AESENC	X7, X7
  1031  
  1032  	PXOR	X6, X4	// fold the four lanes into X4
  1033  	PXOR	X7, X5
  1034  	PXOR	X5, X4
  1035  	MOVL	X4, (DX)
  1036  	RET
  1037  
  1038  aes65plus:
  1039  	// make 3 more starting seeds
  1040  	MOVO	X1, X2
  1041  	MOVO	X1, X3
  1042  	PXOR	runtime·aeskeysched+16(SB), X1
  1043  	PXOR	runtime·aeskeysched+32(SB), X2
  1044  	PXOR	runtime·aeskeysched+48(SB), X3
  1045  	AESENC	X1, X1
  1046  	AESENC	X2, X2
  1047  	AESENC	X3, X3
  1048  	
  1049  	// start with last (possibly overlapping) block
  1050  	MOVOU	-64(AX)(BX*1), X4
  1051  	MOVOU	-48(AX)(BX*1), X5
  1052  	MOVOU	-32(AX)(BX*1), X6
  1053  	MOVOU	-16(AX)(BX*1), X7
  1054  
  1055  	// scramble state once
  1056  	AESENC	X0, X4
  1057  	AESENC	X1, X5
  1058  	AESENC	X2, X6
  1059  	AESENC	X3, X7
  1060  
  1061  	// compute number of remaining 64-byte blocks
  1062  	DECL	BX
  1063  	SHRL	$6, BX	// BX = (len-1)/64, at least 1 because len >= 65 on this path
  1064  	
  1065  aesloop:
  1066  	// scramble state, xor in a block
  1067  	MOVOU	(AX), X0	// the seed registers X0-X3 are reused for data from here on
  1068  	MOVOU	16(AX), X1
  1069  	MOVOU	32(AX), X2
  1070  	MOVOU	48(AX), X3
  1071  	AESENC	X0, X4
  1072  	AESENC	X1, X5
  1073  	AESENC	X2, X6
  1074  	AESENC	X3, X7
  1075  
  1076  	// scramble state
  1077  	AESENC	X4, X4
  1078  	AESENC	X5, X5
  1079  	AESENC	X6, X6
  1080  	AESENC	X7, X7
  1081  
  1082  	ADDL	$64, AX	// advance to the next 64-byte block
  1083  	DECL	BX
  1084  	JNE	aesloop	// repeat until the block counter reaches zero
  1085  
  1086  	// 2 more scrambles to finish
  1087  	AESENC	X4, X4
  1088  	AESENC	X5, X5
  1089  	AESENC	X6, X6
  1090  	AESENC	X7, X7
  1091  	
  1092  	AESENC	X4, X4
  1093  	AESENC	X5, X5
  1094  	AESENC	X6, X6
  1095  	AESENC	X7, X7
  1096  
  1097  	PXOR	X6, X4	// fold the four lanes into X4
  1098  	PXOR	X7, X5
  1099  	PXOR	X5, X4
  1100  	MOVL	X4, (DX)
  1101  	RET
  1102  
  1103  TEXT runtime·aeshash32(SB),NOSPLIT,$0-12	// AES-based hash of the 4 bytes at p, seeded with h
  1104  	MOVL	p+0(FP), AX	// ptr to data
  1105  	MOVL	h+4(FP), X0	// seed
  1106  	PINSRD	$1, (AX), X0	// data, placed in 32-bit lane 1 next to the seed
  1107  	AESENC	runtime·aeskeysched+0(SB), X0	// three AES rounds, each keyed by the next 16 bytes of aeskeysched
  1108  	AESENC	runtime·aeskeysched+16(SB), X0
  1109  	AESENC	runtime·aeskeysched+32(SB), X0
  1110  	MOVL	X0, ret+8(FP)	// low 32 bits of the state are the result
  1111  	RET
  1112  
  1113  TEXT runtime·aeshash64(SB),NOSPLIT,$0-12	// AES-based hash of the 8 bytes at p, seeded with h
  1114  	MOVL	p+0(FP), AX	// ptr to data
  1115  	MOVQ	(AX), X0	// data
  1116  	PINSRD	$2, h+4(FP), X0	// seed, placed in 32-bit lane 2 above the 8 data bytes
  1117  	AESENC	runtime·aeskeysched+0(SB), X0	// three AES rounds, each keyed by the next 16 bytes of aeskeysched
  1118  	AESENC	runtime·aeskeysched+16(SB), X0
  1119  	AESENC	runtime·aeskeysched+32(SB), X0
  1120  	MOVL	X0, ret+8(FP)	// low 32 bits of the state are the result
  1121  	RET
  1122  
  1123  // masks<> holds sixteen 16-byte masks; entry n (at offset n*16) has its low n bytes set to 0xff, to get rid of data in the high part of the register.
  1124  DATA masks<>+0x00(SB)/4, $0x00000000	// entry 0: keep nothing
  1125  DATA masks<>+0x04(SB)/4, $0x00000000
  1126  DATA masks<>+0x08(SB)/4, $0x00000000
  1127  DATA masks<>+0x0c(SB)/4, $0x00000000
  1128  	
  1129  DATA masks<>+0x10(SB)/4, $0x000000ff	// entry 1: keep the lowest byte
  1130  DATA masks<>+0x14(SB)/4, $0x00000000
  1131  DATA masks<>+0x18(SB)/4, $0x00000000
  1132  DATA masks<>+0x1c(SB)/4, $0x00000000
  1133  	
  1134  DATA masks<>+0x20(SB)/4, $0x0000ffff
  1135  DATA masks<>+0x24(SB)/4, $0x00000000
  1136  DATA masks<>+0x28(SB)/4, $0x00000000
  1137  DATA masks<>+0x2c(SB)/4, $0x00000000
  1138  	
  1139  DATA masks<>+0x30(SB)/4, $0x00ffffff
  1140  DATA masks<>+0x34(SB)/4, $0x00000000
  1141  DATA masks<>+0x38(SB)/4, $0x00000000
  1142  DATA masks<>+0x3c(SB)/4, $0x00000000
  1143  	
  1144  DATA masks<>+0x40(SB)/4, $0xffffffff
  1145  DATA masks<>+0x44(SB)/4, $0x00000000
  1146  DATA masks<>+0x48(SB)/4, $0x00000000
  1147  DATA masks<>+0x4c(SB)/4, $0x00000000
  1148  	
  1149  DATA masks<>+0x50(SB)/4, $0xffffffff
  1150  DATA masks<>+0x54(SB)/4, $0x000000ff
  1151  DATA masks<>+0x58(SB)/4, $0x00000000
  1152  DATA masks<>+0x5c(SB)/4, $0x00000000
  1153  	
  1154  DATA masks<>+0x60(SB)/4, $0xffffffff
  1155  DATA masks<>+0x64(SB)/4, $0x0000ffff
  1156  DATA masks<>+0x68(SB)/4, $0x00000000
  1157  DATA masks<>+0x6c(SB)/4, $0x00000000
  1158  	
  1159  DATA masks<>+0x70(SB)/4, $0xffffffff
  1160  DATA masks<>+0x74(SB)/4, $0x00ffffff
  1161  DATA masks<>+0x78(SB)/4, $0x00000000
  1162  DATA masks<>+0x7c(SB)/4, $0x00000000
  1163  	
  1164  DATA masks<>+0x80(SB)/4, $0xffffffff
  1165  DATA masks<>+0x84(SB)/4, $0xffffffff
  1166  DATA masks<>+0x88(SB)/4, $0x00000000
  1167  DATA masks<>+0x8c(SB)/4, $0x00000000
  1168  	
  1169  DATA masks<>+0x90(SB)/4, $0xffffffff
  1170  DATA masks<>+0x94(SB)/4, $0xffffffff
  1171  DATA masks<>+0x98(SB)/4, $0x000000ff
  1172  DATA masks<>+0x9c(SB)/4, $0x00000000
  1173  	
  1174  DATA masks<>+0xa0(SB)/4, $0xffffffff
  1175  DATA masks<>+0xa4(SB)/4, $0xffffffff
  1176  DATA masks<>+0xa8(SB)/4, $0x0000ffff
  1177  DATA masks<>+0xac(SB)/4, $0x00000000
  1178  	
  1179  DATA masks<>+0xb0(SB)/4, $0xffffffff
  1180  DATA masks<>+0xb4(SB)/4, $0xffffffff
  1181  DATA masks<>+0xb8(SB)/4, $0x00ffffff
  1182  DATA masks<>+0xbc(SB)/4, $0x00000000
  1183  	
  1184  DATA masks<>+0xc0(SB)/4, $0xffffffff
  1185  DATA masks<>+0xc4(SB)/4, $0xffffffff
  1186  DATA masks<>+0xc8(SB)/4, $0xffffffff
  1187  DATA masks<>+0xcc(SB)/4, $0x00000000
  1188  	
  1189  DATA masks<>+0xd0(SB)/4, $0xffffffff
  1190  DATA masks<>+0xd4(SB)/4, $0xffffffff
  1191  DATA masks<>+0xd8(SB)/4, $0xffffffff
  1192  DATA masks<>+0xdc(SB)/4, $0x000000ff
  1193  	
  1194  DATA masks<>+0xe0(SB)/4, $0xffffffff
  1195  DATA masks<>+0xe4(SB)/4, $0xffffffff
  1196  DATA masks<>+0xe8(SB)/4, $0xffffffff
  1197  DATA masks<>+0xec(SB)/4, $0x0000ffff
  1198  	
  1199  DATA masks<>+0xf0(SB)/4, $0xffffffff	// entry 15: keep the low 15 bytes
  1200  DATA masks<>+0xf4(SB)/4, $0xffffffff
  1201  DATA masks<>+0xf8(SB)/4, $0xffffffff
  1202  DATA masks<>+0xfc(SB)/4, $0x00ffffff
  1203  
  1204  GLOBL masks<>(SB),RODATA,$256	// 16 entries * 16 bytes, read-only
  1205  
  1206  // these are arguments to pshufb (sixteen 16-byte entries). They move data down from
  1207  // the high bytes of the register to the low bytes of the register.
  1208  // index is how many bytes to move; entry n lists source bytes 16-n..15 and pads with 0xff.
  1209  DATA shifts<>+0x00(SB)/4, $0x00000000	// entry 0
  1210  DATA shifts<>+0x04(SB)/4, $0x00000000
  1211  DATA shifts<>+0x08(SB)/4, $0x00000000
  1212  DATA shifts<>+0x0c(SB)/4, $0x00000000
  1213  	
  1214  DATA shifts<>+0x10(SB)/4, $0xffffff0f	// entry 1: result byte 0 comes from source byte 15
  1215  DATA shifts<>+0x14(SB)/4, $0xffffffff
  1216  DATA shifts<>+0x18(SB)/4, $0xffffffff
  1217  DATA shifts<>+0x1c(SB)/4, $0xffffffff
  1218  	
  1219  DATA shifts<>+0x20(SB)/4, $0xffff0f0e
  1220  DATA shifts<>+0x24(SB)/4, $0xffffffff
  1221  DATA shifts<>+0x28(SB)/4, $0xffffffff
  1222  DATA shifts<>+0x2c(SB)/4, $0xffffffff
  1223  	
  1224  DATA shifts<>+0x30(SB)/4, $0xff0f0e0d
  1225  DATA shifts<>+0x34(SB)/4, $0xffffffff
  1226  DATA shifts<>+0x38(SB)/4, $0xffffffff
  1227  DATA shifts<>+0x3c(SB)/4, $0xffffffff
  1228  	
  1229  DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c
  1230  DATA shifts<>+0x44(SB)/4, $0xffffffff
  1231  DATA shifts<>+0x48(SB)/4, $0xffffffff
  1232  DATA shifts<>+0x4c(SB)/4, $0xffffffff
  1233  	
  1234  DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b
  1235  DATA shifts<>+0x54(SB)/4, $0xffffff0f
  1236  DATA shifts<>+0x58(SB)/4, $0xffffffff
  1237  DATA shifts<>+0x5c(SB)/4, $0xffffffff
  1238  	
  1239  DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a
  1240  DATA shifts<>+0x64(SB)/4, $0xffff0f0e
  1241  DATA shifts<>+0x68(SB)/4, $0xffffffff
  1242  DATA shifts<>+0x6c(SB)/4, $0xffffffff
  1243  	
  1244  DATA shifts<>+0x70(SB)/4, $0x0c0b0a09
  1245  DATA shifts<>+0x74(SB)/4, $0xff0f0e0d
  1246  DATA shifts<>+0x78(SB)/4, $0xffffffff
  1247  DATA shifts<>+0x7c(SB)/4, $0xffffffff
  1248  	
  1249  DATA shifts<>+0x80(SB)/4, $0x0b0a0908
  1250  DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c
  1251  DATA shifts<>+0x88(SB)/4, $0xffffffff
  1252  DATA shifts<>+0x8c(SB)/4, $0xffffffff
  1253  	
  1254  DATA shifts<>+0x90(SB)/4, $0x0a090807
  1255  DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b
  1256  DATA shifts<>+0x98(SB)/4, $0xffffff0f
  1257  DATA shifts<>+0x9c(SB)/4, $0xffffffff
  1258  	
  1259  DATA shifts<>+0xa0(SB)/4, $0x09080706
  1260  DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a
  1261  DATA shifts<>+0xa8(SB)/4, $0xffff0f0e
  1262  DATA shifts<>+0xac(SB)/4, $0xffffffff
  1263  	
  1264  DATA shifts<>+0xb0(SB)/4, $0x08070605
  1265  DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09
  1266  DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d
  1267  DATA shifts<>+0xbc(SB)/4, $0xffffffff
  1268  	
  1269  DATA shifts<>+0xc0(SB)/4, $0x07060504
  1270  DATA shifts<>+0xc4(SB)/4, $0x0b0a0908
  1271  DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c
  1272  DATA shifts<>+0xcc(SB)/4, $0xffffffff
  1273  	
  1274  DATA shifts<>+0xd0(SB)/4, $0x06050403
  1275  DATA shifts<>+0xd4(SB)/4, $0x0a090807
  1276  DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b
  1277  DATA shifts<>+0xdc(SB)/4, $0xffffff0f
  1278  	
  1279  DATA shifts<>+0xe0(SB)/4, $0x05040302
  1280  DATA shifts<>+0xe4(SB)/4, $0x09080706
  1281  DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a
  1282  DATA shifts<>+0xec(SB)/4, $0xffff0f0e
  1283  	
  1284  DATA shifts<>+0xf0(SB)/4, $0x04030201	// entry 15: result bytes 0..14 come from source bytes 1..15
  1285  DATA shifts<>+0xf4(SB)/4, $0x08070605
  1286  DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09
  1287  DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d
  1288  
  1289  GLOBL shifts<>(SB),RODATA,$256	// 16 entries * 16 bytes, read-only
  1290  
  1291  TEXT ·checkASM(SB),NOSPLIT,$0-1
  1292  	// report whether both lookup tables, shifts<>(SB) and masks<>(SB), start on a 16-byte boundary
  1293  	MOVL	$shifts<>(SB), AX
  1294  	MOVL	$masks<>(SB), BX
  1295  	ORL	BX, AX	// a low bit set in either address survives the OR
  1296  	TESTL	$15, AX	// ZF = 1 iff the low four bits of both addresses are clear
  1297  	SETEQ	ret+0(FP)
  1298  	RET
  1299  
  1300  // memequal(p, q unsafe.Pointer, size uintptr) bool
  1301  TEXT runtime·memequal(SB),NOSPLIT,$0-13
  1302  	MOVL	b+4(FP), DI
  1303  	MOVL	a+0(FP), SI
  1304  	CMPL	SI, DI	// identical pointers need no byte comparison
  1305  	JEQ	sameptr
  1306  	LEAL	ret+12(FP), AX	// AX = address of the result byte
  1307  	MOVL	size+8(FP), BX	// BX = byte count
  1308  	JMP	runtime·memeqbody(SB)
  1309  sameptr:
  1310  	MOVB    $1, ret+12(FP)
  1311  	RET
  1312  
  1313  // memequal_varlen(a, b unsafe.Pointer) bool
  1314  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
  1315  	MOVL    b+4(FP), DI
  1316  	MOVL    a+0(FP), SI
  1317  	CMPL    SI, DI
  1318  	JEQ     sameptr
  1319  	LEAL	ret+8(FP), AX
  1320  	MOVL    4(DX), BX    // size is kept by the compiler at offset 4 of the closure in DX
  1321  	JMP	runtime·memeqbody(SB)
  1322  sameptr:
  1323  	MOVB    $1, ret+8(FP)
  1324  	RET
  1325  
  1326  // eqstring reports whether two strings hold the same bytes.
  1327  // Callers (the compiler) only pass strings whose
  1328  // lengths already match, so just s1_len is read.
  1329  // runtime_test.go:eqstring_generic is the
  1330  // equivalent Go code.
  1331  TEXT runtime·eqstring(SB),NOSPLIT,$0-17
  1332  	MOVL	s2_base+8(FP), DI
  1333  	MOVL	s1_base+0(FP), SI
  1334  	CMPL	SI, DI
  1335  	JEQ	identical
  1336  	LEAL	ret+16(FP), AX
  1337  	MOVL	s1_len+4(FP), BX
  1338  	JMP	runtime·memeqbody(SB)
  1339  identical:
  1340  	MOVB	$1, ret+16(FP)
  1341  	RET
  1342  
  1343  TEXT bytes·Equal(SB),NOSPLIT,$0-25	// slices of different length are unequal; otherwise memeqbody decides
  1344  	MOVL	b_len+16(FP), CX
  1345  	MOVL	a_len+4(FP), BX
  1346  	CMPL	BX, CX
  1347  	JNE	lendiffer
  1348  	LEAL	ret+24(FP), AX
  1349  	MOVL	b+12(FP), DI
  1350  	MOVL	a+0(FP), SI
  1351  	JMP	runtime·memeqbody(SB)
  1352  lendiffer:
  1353  	MOVB	$0, ret+24(FP)
  1354  	RET
  1355  
  1356  // memeqbody is the comparison body that memequal, memequal_varlen, eqstring and bytes·Equal JMP to. a in SI
  1357  // b in DI
  1358  // count in BX
  1359  // address of result byte in AX (1 is stored if the ranges are equal, 0 otherwise)
  1360  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1361  	CMPL	BX, $4
  1362  	JB	small	// fewer than 4 bytes: handled with a single masked 32-bit load per side
  1363  
  1364  	// 64 bytes at a time using xmm registers
  1365  hugeloop:
  1366  	CMPL	BX, $64
  1367  	JB	bigloop
  1368  	CMPB	runtime·support_sse2(SB), $1
  1369  	JNE	bigloop	// use the 4-byte loop when support_sse2 is not 1
  1370  	MOVOU	(SI), X0
  1371  	MOVOU	(DI), X1
  1372  	MOVOU	16(SI), X2
  1373  	MOVOU	16(DI), X3
  1374  	MOVOU	32(SI), X4
  1375  	MOVOU	32(DI), X5
  1376  	MOVOU	48(SI), X6
  1377  	MOVOU	48(DI), X7
  1378  	PCMPEQB	X1, X0	// bytewise compare of each 16-byte pair
  1379  	PCMPEQB	X3, X2
  1380  	PCMPEQB	X5, X4
  1381  	PCMPEQB	X7, X6
  1382  	PAND	X2, X0	// AND the four comparison results together
  1383  	PAND	X6, X4
  1384  	PAND	X4, X0
  1385  	PMOVMSKB X0, DX	// one mask bit per byte position
  1386  	ADDL	$64, SI
  1387  	ADDL	$64, DI
  1388  	SUBL	$64, BX
  1389  	CMPL	DX, $0xffff	// all 16 mask bits set means all 64 bytes matched
  1390  	JEQ	hugeloop
  1391  	MOVB	$0, (AX)
  1392  	RET
  1393  
  1394  	// 4 bytes at a time using 32-bit register
  1395  bigloop:
  1396  	CMPL	BX, $4
  1397  	JBE	leftover	// 4 or fewer bytes remain
  1398  	MOVL	(SI), CX
  1399  	MOVL	(DI), DX
  1400  	ADDL	$4, SI
  1401  	ADDL	$4, DI
  1402  	SUBL	$4, BX
  1403  	CMPL	CX, DX
  1404  	JEQ	bigloop
  1405  	MOVB	$0, (AX)
  1406  	RET
  1407  
  1408  	// remaining 0-4 bytes
  1409  leftover:
  1410  	MOVL	-4(SI)(BX*1), CX	// the 4 bytes ending at the end of a (may re-read bytes already compared)
  1411  	MOVL	-4(DI)(BX*1), DX	// same for b
  1412  	CMPL	CX, DX
  1413  	SETEQ	(AX)
  1414  	RET
  1415  
  1416  small:
  1417  	CMPL	BX, $0
  1418  	JEQ	equal	// zero bytes compare equal; ZF from this compare feeds SETEQ at equal
  1419  
  1420  	LEAL	0(BX*8), CX
  1421  	NEGL	CX	// CX = -8*count; x86 shifts use the count mod 32, i.e. 32-8*count
  1422  
  1423  	MOVL	SI, DX
  1424  	CMPB	DX, $0xfc	// look at the low byte of the address only
  1425  	JA	si_high
  1426  
  1427  	// load at SI won't cross a page boundary.
  1428  	MOVL	(SI), SI
  1429  	JMP	si_finish
  1430  si_high:
  1431  	// address ends in 111111xx. Load up to bytes we want, move to correct position.
  1432  	MOVL	-4(SI)(BX*1), SI
  1433  	SHRL	CX, SI
  1434  si_finish:
  1435  
  1436  	// same for DI.
  1437  	MOVL	DI, DX
  1438  	CMPB	DX, $0xfc
  1439  	JA	di_high
  1440  	MOVL	(DI), DI
  1441  	JMP	di_finish
  1442  di_high:
  1443  	MOVL	-4(DI)(BX*1), DI
  1444  	SHRL	CX, DI
  1445  di_finish:
  1446  
  1447  	SUBL	SI, DI	// difference of the two loaded words
  1448  	SHLL	CX, DI	// shift out everything above the low count bytes; ZF=1 iff those bytes matched
  1449  equal:
  1450  	SETEQ	(AX)	// store ZF as the result byte
  1451  	RET
  1452  
  1453  TEXT runtime·cmpstring(SB),NOSPLIT,$0-20	// load both strings into cmpbody's input registers
  1454  	LEAL	ret+16(FP), AX	// AX = address of the result word
  1455  	MOVL	s2_len+12(FP), DX	// DX = length of s2
  1456  	MOVL	s2_base+8(FP), DI	// DI = bytes of s2
  1457  	MOVL	s1_len+4(FP), BX	// BX = length of s1
  1458  	MOVL	s1_base+0(FP), SI	// SI = bytes of s1
  1459  	JMP	runtime·cmpbody(SB)
  1460  
  1461  TEXT bytes·Compare(SB),NOSPLIT,$0-28	// load both slices into cmpbody's input registers
  1462  	LEAL	ret+24(FP), AX	// AX = address of the result word
  1463  	MOVL	s2+16(FP), DX	// DX = length of the second slice
  1464  	MOVL	s2+12(FP), DI	// DI = data of the second slice
  1465  	MOVL	s1+4(FP), BX	// BX = length of the first slice
  1466  	MOVL	s1+0(FP), SI	// SI = data of the first slice
  1467  	JMP	runtime·cmpbody(SB)
  1468  
  1469  TEXT bytes·IndexByte(SB),NOSPLIT,$0-20	// index of the first byte equal to c in s, or -1
  1470  	MOVL	s+0(FP), SI	// SI keeps the start of the data
  1471  	MOVL	s_len+4(FP), CX	// CX = number of bytes to scan
  1472  	MOVB	c+12(FP), AL	// AL = byte to look for
  1473  	MOVL	SI, DI	// DI = scan pointer
  1474  	CLD; REPN; SCASB	// scan forward until a byte equals AL or CX runs out
  1475  	JZ 3(PC)	// match: skip the two not-found instructions below
  1476  	MOVL	$-1, ret+16(FP)
  1477  	RET
  1478  	SUBL	SI, DI	// DI was advanced one past the matching byte
  1479  	SUBL	$1, DI	// so index = DI - SI - 1
  1480  	MOVL	DI, ret+16(FP)	// NOTE(review): with s_len == 0 nothing is scanned, but DI == SI makes this path yield -1 too - confirm
  1481  	RET
  1482  
  1483  TEXT strings·IndexByte(SB),NOSPLIT,$0-16	// index of the first byte equal to c in s, or -1
  1484  	MOVL	s+0(FP), SI	// SI keeps the start of the data
  1485  	MOVL	s_len+4(FP), CX	// CX = number of bytes to scan
  1486  	MOVB	c+8(FP), AL	// AL = byte to look for
  1487  	MOVL	SI, DI	// DI = scan pointer
  1488  	CLD; REPN; SCASB	// scan forward until a byte equals AL or CX runs out
  1489  	JZ 3(PC)	// match: skip the two not-found instructions below
  1490  	MOVL	$-1, ret+12(FP)
  1491  	RET
  1492  	SUBL	SI, DI	// DI was advanced one past the matching byte
  1493  	SUBL	$1, DI	// so index = DI - SI - 1
  1494  	MOVL	DI, ret+12(FP)	// NOTE(review): with s_len == 0 nothing is scanned, but DI == SI makes this path yield -1 too - confirm
  1495  	RET
  1496  
  1497  // cmpbody is the three-way comparison body that cmpstring and bytes·Compare JMP to. input:
  1498  //   SI = a
  1499  //   DI = b
  1500  //   BX = alen
  1501  //   DX = blen
  1502  //   AX = address of return word (set to 1/0/-1)
  1503  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1504  	MOVL	DX, BP
  1505  	SUBL	BX, DX // DX = blen-alen
  1506  	JLE	2(PC)	// blen <= alen: BP already holds blen, skip the next instruction
  1507  	MOVL	BX, BP // BP = min(alen, blen)
  1508  	CMPL	SI, DI
  1509  	JEQ	allsame	// same start address: only the lengths can differ
  1510  	CMPL	BP, $4
  1511  	JB	small
  1512  	CMPB	runtime·support_sse2(SB), $1
  1513  	JNE	mediumloop	// use the 4-byte loop when support_sse2 is not 1
  1514  largeloop:
  1515  	CMPL	BP, $16
  1516  	JB	mediumloop
  1517  	MOVOU	(SI), X0
  1518  	MOVOU	(DI), X1
  1519  	PCMPEQB X0, X1
  1520  	PMOVMSKB X1, BX	// one mask bit per byte, set where the bytes are equal
  1521  	XORL	$0xffff, BX	// convert EQ to NE
  1522  	JNE	diff16	// branch if at least one byte is not equal
  1523  	ADDL	$16, SI
  1524  	ADDL	$16, DI
  1525  	SUBL	$16, BP
  1526  	JMP	largeloop
  1527  
  1528  diff16:
  1529  	BSFL	BX, BX	// index of first byte that differs
  1530  	XORL	DX, DX
  1531  	MOVB	(SI)(BX*1), CX
  1532  	CMPB	CX, (DI)(BX*1)
  1533  	SETHI	DX	// 1 if a's byte is the larger one (unsigned)
  1534  	LEAL	-1(DX*2), DX	// convert 1/0 to +1/-1
  1535  	MOVL	DX, (AX)
  1536  	RET
  1537  
  1538  mediumloop:
  1539  	CMPL	BP, $4
  1540  	JBE	_0through4	// 4 or fewer common bytes remain
  1541  	MOVL	(SI), BX
  1542  	MOVL	(DI), CX
  1543  	CMPL	BX, CX
  1544  	JNE	diff4
  1545  	ADDL	$4, SI
  1546  	ADDL	$4, DI
  1547  	SUBL	$4, BP
  1548  	JMP	mediumloop
  1549  
  1550  _0through4:
  1551  	MOVL	-4(SI)(BP*1), BX	// the 4 bytes ending at the end of the common prefix (may re-read compared bytes)
  1552  	MOVL	-4(DI)(BP*1), CX
  1553  	CMPL	BX, CX
  1554  	JEQ	allsame
  1555  
  1556  diff4:
  1557  	BSWAPL	BX	// reverse order of bytes
  1558  	BSWAPL	CX
  1559  	XORL	BX, CX	// find bit differences
  1560  	BSRL	CX, CX	// index of highest bit difference
  1561  	SHRL	CX, BX	// move a's bit to bottom
  1562  	ANDL	$1, BX	// mask bit
  1563  	LEAL	-1(BX*2), BX // 1/0 => +1/-1
  1564  	MOVL	BX, (AX)
  1565  	RET
  1566  
  1567  	// 0-3 bytes in common
  1568  small:
  1569  	LEAL	(BP*8), CX
  1570  	NEGL	CX	// CX = -8*BP; x86 shifts use the count mod 32, i.e. 32-8*BP
  1571  	JEQ	allsame	// NEGL leaves ZF=1 when BP was 0: no common bytes to compare
  1572  
  1573  	// load si
  1574  	CMPB	SI, $0xfc	// low byte of the address above 0xfc: avoid loading forward from it
  1575  	JA	si_high
  1576  	MOVL	(SI), SI
  1577  	JMP	si_finish
  1578  si_high:
  1579  	MOVL	-4(SI)(BP*1), SI	// load the 4 bytes ending at the last wanted byte
  1580  	SHRL	CX, SI	// bring the wanted bytes down to the low end
  1581  si_finish:
  1582  	SHLL	CX, SI	// move the BP wanted bytes to the top, dropping everything else
  1583  
  1584  	// same for di
  1585  	CMPB	DI, $0xfc
  1586  	JA	di_high
  1587  	MOVL	(DI), DI
  1588  	JMP	di_finish
  1589  di_high:
  1590  	MOVL	-4(DI)(BP*1), DI
  1591  	SHRL	CX, DI
  1592  di_finish:
  1593  	SHLL	CX, DI
  1594  
  1595  	BSWAPL	SI	// reverse order of bytes
  1596  	BSWAPL	DI
  1597  	XORL	SI, DI	// find bit differences
  1598  	JEQ	allsame
  1599  	BSRL	DI, CX	// index of highest bit difference
  1600  	SHRL	CX, SI	// move a's bit to bottom
  1601  	ANDL	$1, SI	// mask bit
  1602  	LEAL	-1(SI*2), BX // 1/0 => +1/-1
  1603  	MOVL	BX, (AX)
  1604  	RET
  1605  
  1606  	// all the bytes in common are the same, so we just need
  1607  	// to compare the lengths.
  1608  allsame:
  1609  	XORL	BX, BX
  1610  	XORL	CX, CX
  1611  	TESTL	DX, DX	// DX still holds blen-alen
  1612  	SETLT	BX	// 1 if alen > blen
  1613  	SETEQ	CX	// 1 if alen == blen
  1614  	LEAL	-1(CX)(BX*2), BX	// 1,0,-1 result
  1615  	MOVL	BX, (AX)
  1616  	RET
  1617  
  1618  TEXT runtime·return0(SB), NOSPLIT, $0	// return with AX = 0
  1619  	MOVL	$0, AX
  1620  	RET
  1621  
  1622  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  1623  // Must obey the gcc calling convention; the result is left in AX.
  1624  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  1625  	get_tls(CX)
  1626  	MOVL	g(CX), AX	// AX = g
  1627  	MOVL	g_m(AX), AX	// AX = g->m
  1628  	MOVL	m_curg(AX), AX	// AX = g->m->curg
  1629  	MOVL	(g_stack+stack_hi)(AX), AX	// AX = curg's stack.hi
  1630  	RET
  1631  
  1632  // The top-most function running on a goroutine
  1633  // returns to goexit+PCQuantum, i.e. just past the leading NOP byte below.
  1634  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  1635  	BYTE	$0x90	// NOP
  1636  	CALL	runtime·goexit1(SB)	// does not return
  1637  	// traceback from goexit1 must hit code range of goexit
  1638  	BYTE	$0x90	// NOP, so the CALL's return address still lies inside goexit
  1639  
  1640  // Add a module's moduledata to the linked list of moduledata objects. This
  1641  // is called from .init_array by a function generated in the linker and so
  1642  // follows the platform ABI wrt register preservation -- it only touches AX,
  1643  // CX (implicitly) and DX, but it does not follow the ABI wrt arguments:
  1644  // instead the pointer to the moduledata is passed in AX.
  1645  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  1646         MOVL    runtime·lastmoduledatap(SB), DX	// DX = current tail of the list
  1647         MOVL    AX, moduledata_next(DX)	// tail.next = new moduledata
  1648         MOVL    AX, runtime·lastmoduledatap(SB)	// the new moduledata becomes the tail
  1649         RET
  1650  
  1651  TEXT runtime·uint32tofloat64(SB),NOSPLIT,$8-12	// convert the uint32 argument a to a float64 result via the x87 FPU
  1652  	MOVL	a+0(FP), AX
  1653  	MOVL	AX, 0(SP)	// low word of an 8-byte temporary = a
  1654  	MOVL	$0, 4(SP)	// high word = 0, so the 8-byte value is a zero-extended
  1655  	FMOVV	0(SP), F0	// load the 8-byte temporary onto the FPU stack (presumably as a 64-bit integer - confirm FMOVV semantics)
  1656  	FMOVDP	F0, ret+4(FP)	// store it to the result slot and pop
  1657  	RET
  1658  
  1659  TEXT runtime·float64touint32(SB),NOSPLIT,$12-12	// convert the float64 argument a to a uint32 result via the x87 FPU
  1660  	FMOVD	a+0(FP), F0	// push a onto the FPU stack
  1661  	FSTCW	0(SP)	// save the current FPU control word
  1662  	FLDCW	runtime·controlWord64trunc(SB)	// switch control word (presumably 64-bit precision with truncation, per its name - confirm)
  1663  	FMOVVP	F0, 4(SP)	// store the value into the 8-byte temporary at 4(SP) and pop
  1664  	FLDCW	0(SP)	// restore the saved control word
  1665  	MOVL	4(SP), AX	// low 32 bits of the temporary
  1666  	MOVL	AX, ret+8(FP)
  1667  	RET