github.com/stingnevermore/go@v0.0.0-20180120041312-3810f5bfed72/src/runtime/asm_386.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  // _rt0_386 is common startup code for most 386 systems when using
    11  // internal linking. This is the entry point for the program from the
    12  // kernel for an ordinary -buildmode=exe program. The stack holds the
    13  // number of arguments and the C-style argv.
        // It copies argc and the address of argv into its own 8-byte frame and
        // then jumps (does not call) to runtime·rt0_go, which reads them from
        // 0(SP) and 4(SP).
    14  TEXT _rt0_386(SB),NOSPLIT,$8
    15  	MOVL	8(SP), AX	// argc
    16  	LEAL	12(SP), BX	// argv
    17  	MOVL	AX, 0(SP)	// argc where rt0_go expects it
    18  	MOVL	BX, 4(SP)	// argv where rt0_go expects it
    19  	JMP	runtime·rt0_go(SB)
    20  
    21  // _rt0_386_lib is common startup code for most 386 systems when
    22  // using -buildmode=c-archive or -buildmode=c-shared. The linker will
    23  // arrange to invoke this function as a global constructor (for
    24  // c-archive) or when the shared library is loaded (for c-shared).
    25  // We expect argc and argv to be passed on the stack following the
    26  // usual C ABI.
        // It saves argc/argv in file-local globals, runs runtime·libpreinit,
        // and then starts _rt0_386_lib_go on a new thread: through
        // _cgo_sys_thread_create when that pointer is non-nil, otherwise
        // through runtime·newosproc0. BP, BX, SI and DI are pushed on entry
        // and popped before returning.
    27  TEXT _rt0_386_lib(SB),NOSPLIT,$0
    28  	PUSHL	BP
    29  	MOVL	SP, BP
    30  	PUSHL	BX
    31  	PUSHL	SI
    32  	PUSHL	DI
    33  
    34  	MOVL	8(BP), AX	// first stack argument: argc
    35  	MOVL	AX, _rt0_386_lib_argc<>(SB)
    36  	MOVL	12(BP), AX	// second stack argument: argv
    37  	MOVL	AX, _rt0_386_lib_argv<>(SB)
    38  
    39  	// Synchronous initialization.
    40  	CALL	runtime·libpreinit(SB)
    41  
    42  	SUBL	$8, SP	// room for the two argument words of the call below
    43  
    44  	// Create a new thread to do the runtime initialization.
    45  	MOVL	_cgo_sys_thread_create(SB), AX
    46  	TESTL	AX, AX
    47  	JZ	nocgo	// no cgo thread-creation hook: use newosproc0 instead
    48  
    49  	// Align stack to call C function.
    50  	// We moved SP to BP above, but BP was clobbered by the libpreinit call.
    51  	MOVL	SP, BP	// remember the unaligned SP so it can be restored after the call
    52  	ANDL	$~15, SP
    53  
    54  	MOVL	$_rt0_386_lib_go(SB), BX
    55  	MOVL	BX, 0(SP)	// first argument: function to run on the new thread
    56  	MOVL	$0, 4(SP)	// second argument: 0
    57  
    58  	CALL	AX
    59  
    60  	MOVL	BP, SP	// undo the alignment
    61  
    62  	JMP	restore
    63  
    64  nocgo:
    65  	MOVL	$0x800000, 0(SP)                    // stacksize = 8192KB
    66  	MOVL	$_rt0_386_lib_go(SB), AX
    67  	MOVL	AX, 4(SP)                           // fn
    68  	CALL	runtime·newosproc0(SB)
    69  
    70  restore:
    71  	ADDL	$8, SP	// release the two argument words reserved above
    72  	POPL	DI
    73  	POPL	SI
    74  	POPL	BX
    75  	POPL	BP
    76  	RET
    77  
    78  // _rt0_386_lib_go initializes the Go runtime.
    79  // This is started in a separate thread by _rt0_386_lib.
        // It reloads the argc/argv values saved by _rt0_386_lib into its frame
        // and jumps to runtime·rt0_go, which reads them from 0(SP) and 4(SP).
    80  TEXT _rt0_386_lib_go(SB),NOSPLIT,$8
    81  	MOVL	_rt0_386_lib_argc<>(SB), AX
    82  	MOVL	AX, 0(SP)	// argc
    83  	MOVL	_rt0_386_lib_argv<>(SB), AX
    84  	MOVL	AX, 4(SP)	// argv
    85  	JMP	runtime·rt0_go(SB)
    86  
    87  DATA _rt0_386_lib_argc<>(SB)/4, $0	// argc saved by _rt0_386_lib, read by _rt0_386_lib_go
    88  GLOBL _rt0_386_lib_argc<>(SB),NOPTR, $4
    89  DATA _rt0_386_lib_argv<>(SB)/4, $0	// argv saved by _rt0_386_lib, read by _rt0_386_lib_go
    90  GLOBL _rt0_386_lib_argv<>(SB),NOPTR, $4
    91  
    92  TEXT runtime·rt0_go(SB),NOSPLIT,$0
        	// Runtime bootstrap. In order, this routine:
        	//   1. saves argc/argv on a freshly aligned stack and sets
        	//      provisional stack bounds on g0;
        	//   2. probes the CPU with CPUID and records feature flags
        	//      (aborting with bad_proc_msg if CPUID or MMX is missing);
        	//   3. sets up thread-local storage, via _cgo_init when present,
        	//      otherwise via runtime·ldt0setup;
        	//   4. links g0 and m0, runs check/args/osinit/schedinit, queues
        	//      the goroutine whose entry is runtime·mainPC, and calls mstart.
    93  	// Copy arguments forward on an even stack.
    94  	// Users of this function jump to it, they don't call it.
    95  	MOVL	0(SP), AX
    96  	MOVL	4(SP), BX
    97  	SUBL	$128, SP		// plenty of scratch
    98  	ANDL	$~15, SP
    99  	MOVL	AX, 120(SP)		// save argc, argv away
   100  	MOVL	BX, 124(SP)
   101  
   102  	// set default stack bounds.
   103  	// _cgo_init may update stackguard.
   104  	MOVL	$runtime·g0(SB), BP
   105  	LEAL	(-64*1024+104)(SP), BX	// provisional low bound: 64KB below SP, plus 104 bytes
   106  	MOVL	BX, g_stackguard0(BP)
   107  	MOVL	BX, g_stackguard1(BP)
   108  	MOVL	BX, (g_stack+stack_lo)(BP)
   109  	MOVL	SP, (g_stack+stack_hi)(BP)
   110  	
   111  	// find out information about the processor we're on
   112  #ifdef GOOS_nacl // NaCl doesn't like PUSHFL/POPFL
   113  	JMP 	has_cpuid
   114  #else
   115  	// first see if CPUID instruction is supported.
        	// The test toggles bit 21 (ID) of a copy of EFLAGS, loads it, reads
        	// EFLAGS back and checks whether the bit actually changed.
   116  	PUSHFL
   117  	PUSHFL
   118  	XORL	$(1<<21), 0(SP) // flip ID bit
   119  	POPFL
   120  	PUSHFL
   121  	POPL	AX
   122  	XORL	0(SP), AX	// AX = bits that differ from the original EFLAGS
   123  	POPFL	// restore EFLAGS
   124  	TESTL	$(1<<21), AX
   125  	JNE 	has_cpuid
   126  #endif
   127  
        // Also reached by fallthrough when the CPUID instruction is not supported.
   128  bad_proc: // show that the program requires MMX.
   129  	MOVL	$2, 0(SP)
   130  	MOVL	$bad_proc_msg<>(SB), 4(SP)
   131  	MOVL	$0x3d, 8(SP)	// message length, matches the GLOBL size of bad_proc_msg
   132  	CALL	runtime·write(SB)
   133  	MOVL	$1, 0(SP)
   134  	CALL	runtime·exit(SB)
   135  	INT	$3
   136  
   137  has_cpuid:
   138  	MOVL	$0, AX
   139  	CPUID
   140  	MOVL	AX, SI	// keep the leaf-0 AX result; compared against 7 at eax7
   141  	CMPL	AX, $0
   142  	JE	nocpuinfo
   143  
   144  	// Figure out how to serialize RDTSC.
   145  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
   146  	// Don't know about the rest, so let's do MFENCE.
   147  	CMPL	BX, $0x756E6547  // "Genu"
   148  	JNE	notintel
   149  	CMPL	DX, $0x49656E69  // "ineI"
   150  	JNE	notintel
   151  	CMPL	CX, $0x6C65746E  // "ntel"
   152  	JNE	notintel
   153  	MOVB	$1, runtime·isIntel(SB)
   154  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
   155  notintel:
   156  
   157  	// Load EAX=1 cpuid flags
   158  	MOVL	$1, AX
   159  	CPUID
   160  	MOVL	CX, DI // Move to global variable clobbers CX when generating PIC
   161  	MOVL	AX, runtime·processorVersionInfo(SB)
   162  
   163  	// Check for MMX support
   164  	TESTL	$(1<<23), DX // MMX
   165  	JZ	bad_proc
   166  
   167  	TESTL	$(1<<26), DX // SSE2
   168  	SETNE	runtime·support_sse2(SB)
   169  
   170  	TESTL	$(1<<9), DI // SSSE3
   171  	SETNE	runtime·support_ssse3(SB)
   172  
   173  	TESTL	$(1<<19), DI // SSE4.1
   174  	SETNE	runtime·support_sse41(SB)
   175  
   176  	TESTL	$(1<<20), DI // SSE4.2
   177  	SETNE	runtime·support_sse42(SB)
   178  
   179  	TESTL	$(1<<23), DI // POPCNT
   180  	SETNE	runtime·support_popcnt(SB)
   181  
   182  	TESTL	$(1<<25), DI // AES
   183  	SETNE	runtime·support_aes(SB)
   184  
   185  	TESTL	$(1<<27), DI // OSXSAVE
   186  	SETNE	runtime·support_osxsave(SB)
   187  
   188  	// If OS support for XMM and YMM is not present
   189  	// support_avx will be set back to false later.
   190  	TESTL	$(1<<28), DI // AVX
   191  	SETNE	runtime·support_avx(SB)
   192  
   193  eax7:
   194  	// Load EAX=7/ECX=0 cpuid flags
   195  	CMPL	SI, $7
   196  	JLT	osavx	// leaf 7 not available: skip the flags below
   197  	MOVL	$7, AX
   198  	MOVL	$0, CX
   199  	CPUID
   200  
   201  	TESTL	$(1<<3), BX // BMI1
   202  	SETNE	runtime·support_bmi1(SB)
   203  
   204  	// If OS support for XMM and YMM is not present
   205  	// support_avx2 will be set back to false later.
   206  	TESTL	$(1<<5), BX	// AVX2
   207  	SETNE	runtime·support_avx2(SB)
   208  
   209  	TESTL	$(1<<8), BX // BMI2
   210  	SETNE	runtime·support_bmi2(SB)
   211  
   212  	TESTL	$(1<<9), BX // ERMS
   213  	SETNE	runtime·support_erms(SB)
   214  
   215  osavx:
   216  	// nacl does not support XGETBV to test
   217  	// for XMM and YMM OS support.
   218  #ifndef GOOS_nacl
   219  	CMPB	runtime·support_osxsave(SB), $1
   220  	JNE	noavx
   221  	MOVL	$0, CX
   222  	// For XGETBV, OSXSAVE bit is required and sufficient
   223  	XGETBV
   224  	ANDL	$6, AX
   225  	CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
   226  	JE nocpuinfo	// both bits set: keep the AVX flags recorded above
   227  #endif
   228  noavx:
   229  	MOVB $0, runtime·support_avx(SB)
   230  	MOVB $0, runtime·support_avx2(SB)
   231  
   232  nocpuinfo:
   233  	// if there is an _cgo_init, call it to let it
   234  	// initialize and to set up GS.  if not,
   235  	// we set up GS ourselves.
   236  	MOVL	_cgo_init(SB), AX
   237  	TESTL	AX, AX
   238  	JZ	needtls
   239  	MOVL	$setg_gcc<>(SB), BX
   240  	MOVL	BX, 4(SP)	// second argument: setg_gcc
   241  	MOVL	BP, 0(SP)	// first argument: BP, loaded with &runtime·g0 above
   242  	CALL	AX
   243  
   244  	// update stackguard after _cgo_init
   245  	MOVL	$runtime·g0(SB), CX
   246  	MOVL	(g_stack+stack_lo)(CX), AX
   247  	ADDL	$const__StackGuard, AX
   248  	MOVL	AX, g_stackguard0(CX)
   249  	MOVL	AX, g_stackguard1(CX)
   250  
   251  #ifndef GOOS_windows
   252  	// skip runtime·ldt0setup(SB) and tls test after _cgo_init for non-windows
   253  	JMP ok
   254  #endif
   255  needtls:
   256  #ifdef GOOS_plan9
   257  	// skip runtime·ldt0setup(SB) and tls test on Plan 9 in all cases
   258  	JMP	ok
   259  #endif
   260  
   261  	// set up %gs
   262  	CALL	runtime·ldt0setup(SB)
   263  
   264  	// store through it, to make sure it works
   265  	get_tls(BX)
   266  	MOVL	$0x123, g(BX)
   267  	MOVL	runtime·m0+m_tls(SB), AX	// read the same slot back directly from m0.tls
   268  	CMPL	AX, $0x123
   269  	JEQ	ok
   270  	MOVL	AX, 0	// abort
   271  ok:
   272  	// set up m and g "registers"
   273  	get_tls(BX)
   274  	LEAL	runtime·g0(SB), DX
   275  	MOVL	DX, g(BX)
   276  	LEAL	runtime·m0(SB), AX
   277  
   278  	// save m->g0 = g0
   279  	MOVL	DX, m_g0(AX)
   280  	// save g0->m = m0
   281  	MOVL	AX, g_m(DX)
   282  
   283  	CALL	runtime·emptyfunc(SB)	// fault if stack check is wrong
   284  
   285  	// convention is D is always cleared
   286  	CLD
   287  
   288  	CALL	runtime·check(SB)
   289  
   290  	// saved argc, argv
   291  	MOVL	120(SP), AX
   292  	MOVL	AX, 0(SP)
   293  	MOVL	124(SP), AX
   294  	MOVL	AX, 4(SP)
   295  	CALL	runtime·args(SB)
   296  	CALL	runtime·osinit(SB)
   297  	CALL	runtime·schedinit(SB)
   298  
   299  	// create a new goroutine to start program
   300  	PUSHL	$runtime·mainPC(SB)	// entry
   301  	PUSHL	$0	// arg size
   302  	CALL	runtime·newproc(SB)
   303  	POPL	AX	// discard the two words pushed for newproc
   304  	POPL	AX
   305  
   306  	// start this M
   307  	CALL	runtime·mstart(SB)
   308  
        	// Trap if mstart ever returns.
   309  	INT $3
   310  	RET
   311  
   312  DATA	bad_proc_msg<>+0x00(SB)/8, $"This pro"	// message written by rt0_go's bad_proc path
   313  DATA	bad_proc_msg<>+0x08(SB)/8, $"gram can"
   314  DATA	bad_proc_msg<>+0x10(SB)/8, $" only be"
   315  DATA	bad_proc_msg<>+0x18(SB)/8, $" run on "
   316  DATA	bad_proc_msg<>+0x20(SB)/8, $"processo"
   317  DATA	bad_proc_msg<>+0x28(SB)/8, $"rs with "
   318  DATA	bad_proc_msg<>+0x30(SB)/8, $"MMX supp"
   319  DATA	bad_proc_msg<>+0x38(SB)/4, $"ort."
   320  DATA	bad_proc_msg<>+0x3c(SB)/1, $0xa	// trailing newline
   321  GLOBL	bad_proc_msg<>(SB), RODATA, $0x3d	// total length 0x3d, the count rt0_go passes to write
   322  
   323  DATA	runtime·mainPC+0(SB)/4,$runtime·main(SB)	// one word holding the address of runtime·main; rt0_go passes &mainPC to newproc as the entry
   324  GLOBL	runtime·mainPC(SB),RODATA,$4
   325  
   326  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
        	// Executes a single INT $3 (breakpoint trap) and returns.
   327  	INT $3
   328  	RET
   329  
   330  TEXT runtime·asminit(SB),NOSPLIT,$0-0
        	// Loads the x87 FPU control word from runtime·controlWord64.
   331  	// Linux and MinGW start the FPU in extended double precision.
   332  	// Other operating systems use double precision.
   333  	// Change to double precision to match them,
   334  	// and to match other hardware that only has double.
   335  	FLDCW	runtime·controlWord64(SB)
   336  	RET
   337  
   338  /*
   339   *  go-routine
   340   */
   341  
   342  // void gosave(Gobuf*)
   343  // save state in Gobuf; setjmp
        // Records the caller's SP and PC and the current g in *buf, and zeroes
        // buf.ret. Calls runtime·badctxt if buf.ctxt is not already zero.
   344  TEXT runtime·gosave(SB), NOSPLIT, $0-4
   345  	MOVL	buf+0(FP), AX		// gobuf
   346  	LEAL	buf+0(FP), BX		// caller's SP
   347  	MOVL	BX, gobuf_sp(AX)
   348  	MOVL	0(SP), BX		// caller's PC
   349  	MOVL	BX, gobuf_pc(AX)
   350  	MOVL	$0, gobuf_ret(AX)
   351  	// Assert ctxt is zero. See func save.
   352  	MOVL	gobuf_ctxt(AX), BX
   353  	TESTL	BX, BX
   354  	JZ	2(PC)	// ctxt == 0: skip the badctxt call
   355  	CALL	runtime·badctxt(SB)
   356  	get_tls(CX)
   357  	MOVL	g(CX), BX
   358  	MOVL	BX, gobuf_g(AX)	// buf.g = current g
   359  	RET
   360  
   361  // void gogo(Gobuf*)
   362  // restore state from Gobuf; longjmp
        // Makes buf.g the current g, loads SP from buf.sp, AX from buf.ret and
        // DX from buf.ctxt, clears those three fields, and jumps to buf.pc.
        // Does not return to its caller.
   363  TEXT runtime·gogo(SB), NOSPLIT, $8-4
   364  	MOVL	buf+0(FP), BX		// gobuf
   365  	MOVL	gobuf_g(BX), DX
   366  	MOVL	0(DX), CX		// make sure g != nil
   367  	get_tls(CX)
   368  	MOVL	DX, g(CX)	// g = buf.g
   369  	MOVL	gobuf_sp(BX), SP	// restore SP
   370  	MOVL	gobuf_ret(BX), AX
   371  	MOVL	gobuf_ctxt(BX), DX
   372  	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
   373  	MOVL	$0, gobuf_ret(BX)
   374  	MOVL	$0, gobuf_ctxt(BX)
   375  	MOVL	gobuf_pc(BX), BX	// BX is reused: it now holds the target PC
   376  	JMP	BX
   377  
   378  // func mcall(fn func(*g))
   379  // Switch to m->g0's stack, call fn(g).
   380  // Fn must never return. It should gogo(&g->sched)
   381  // to keep running g.
        // The caller's PC and SP and the current g are saved in g->sched first.
        // If the current g is already m->g0 it jumps to runtime·badmcall; if fn
        // returns it jumps to runtime·badmcall2.
   382  TEXT runtime·mcall(SB), NOSPLIT, $0-4
   383  	MOVL	fn+0(FP), DI
   384  
   385  	get_tls(DX)
   386  	MOVL	g(DX), AX	// save state in g->sched
   387  	MOVL	0(SP), BX	// caller's PC
   388  	MOVL	BX, (g_sched+gobuf_pc)(AX)
   389  	LEAL	fn+0(FP), BX	// caller's SP
   390  	MOVL	BX, (g_sched+gobuf_sp)(AX)
   391  	MOVL	AX, (g_sched+gobuf_g)(AX)
   392  
   393  	// switch to m->g0 & its stack, call fn
   394  	MOVL	g(DX), BX
   395  	MOVL	g_m(BX), BX
   396  	MOVL	m_g0(BX), SI
   397  	CMPL	SI, AX	// if g == m->g0 call badmcall
   398  	JNE	3(PC)	// different: skip the two-instruction badmcall jump
   399  	MOVL	$runtime·badmcall(SB), AX
   400  	JMP	AX
   401  	MOVL	SI, g(DX)	// g = m->g0
   402  	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   403  	PUSHL	AX	// argument to fn: the g we switched away from
   404  	MOVL	DI, DX	// DX = fn (the func value)
   405  	MOVL	0(DI), DI	// DI = code pointer stored in the func value
   406  	CALL	DI
   407  	POPL	AX
   408  	MOVL	$runtime·badmcall2(SB), AX	// only reached if fn returned
   409  	JMP	AX
   410  	RET
   411  
   412  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   413  // of the G stack. We need to distinguish the routine that
   414  // lives at the bottom of the G stack from the one that lives
   415  // at the top of the system stack because the one at the top of
   416  // the system stack terminates the stack walk (see topofstack()).
        // Its body is a bare RET; systemstack stores its address as g->sched.pc.
   417  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   418  	RET
   419  
   420  // func systemstack(fn func())
        // Runs fn on the system (g0) stack.
        // If the current g is m->gsignal or m->g0, fn is tail-called on the
        // current stack. If the current g is m->curg, the state is saved in
        // g->sched, execution switches to g0's stack, fn is called, and then
        // execution switches back to m->curg. Any other g is reported through
        // runtime·badsystemstack, followed by a trap.
   421  TEXT runtime·systemstack(SB), NOSPLIT, $0-4
   422  	MOVL	fn+0(FP), DI	// DI = fn
   423  	get_tls(CX)
   424  	MOVL	g(CX), AX	// AX = g
   425  	MOVL	g_m(AX), BX	// BX = m
   426  
   427  	MOVL	m_gsignal(BX), DX	// DX = gsignal
   428  	CMPL	AX, DX
   429  	JEQ	noswitch
   430  
   431  	MOVL	m_g0(BX), DX	// DX = g0
   432  	CMPL	AX, DX
   433  	JEQ	noswitch
   434  
   435  	MOVL	m_curg(BX), BP
   436  	CMPL	AX, BP
   437  	JEQ	switch
   438  	
   439  	// Bad: g is not gsignal, not g0, not curg. What is it?
   440  	// Hide call from linker nosplit analysis.
   441  	MOVL	$runtime·badsystemstack(SB), AX
   442  	CALL	AX
        	// Trap if badsystemstack returns, instead of falling through into
        	// the switch path with an unrecognized g (same pattern as the
        	// bad* calls in morestack).
        	INT	$3
   443  
   444  switch:
   445  	// save our state in g->sched. Pretend to
   446  	// be systemstack_switch if the G stack is scanned.
   447  	MOVL	$runtime·systemstack_switch(SB), (g_sched+gobuf_pc)(AX)
   448  	MOVL	SP, (g_sched+gobuf_sp)(AX)
   449  	MOVL	AX, (g_sched+gobuf_g)(AX)
   450  
   451  	// switch to g0
   452  	get_tls(CX)
   453  	MOVL	DX, g(CX)	// DX still holds g0 here
   454  	MOVL	(g_sched+gobuf_sp)(DX), BX
   455  	// make it look like mstart called systemstack on g0, to stop traceback
   456  	SUBL	$4, BX
   457  	MOVL	$runtime·mstart(SB), DX
   458  	MOVL	DX, 0(BX)	// fake return address on the g0 stack
   459  	MOVL	BX, SP
   460  
   461  	// call target function
   462  	MOVL	DI, DX	// DX = fn (the func value)
   463  	MOVL	0(DI), DI	// DI = code pointer stored in the func value
   464  	CALL	DI
   465  
   466  	// switch back to g
   467  	get_tls(CX)
   468  	MOVL	g(CX), AX
   469  	MOVL	g_m(AX), BX
   470  	MOVL	m_curg(BX), AX
   471  	MOVL	AX, g(CX)
   472  	MOVL	(g_sched+gobuf_sp)(AX), SP
   473  	MOVL	$0, (g_sched+gobuf_sp)(AX)	// clear the saved SP now that it has been restored
   474  	RET
   475  
   476  noswitch:
   477  	// already on system stack; tail call the function
   478  	// Using a tail call here cleans up tracebacks since we won't stop
   479  	// at an intermediate systemstack.
   480  	MOVL	DI, DX
   481  	MOVL	0(DI), DI
   482  	JMP	DI
   483  
   484  /*
   485   * support for morestack
   486   */
   487  
   488  // Called during function prolog when more stack is needed.
   489  //
   490  // The traceback routines see morestack on a g0 as being
   491  // the top of a stack (for example, morestack calling newstack
   492  // calling the scheduler calling newm calling gc), so we must
   493  // record an argument size. For that purpose, it has no arguments.
        //
        // It records f's caller in m->morebuf and f's own context (including
        // DX as ctxt) in g->sched, then switches to m->g0's stack and calls
        // runtime·newstack. Growing g0 or gsignal is reported and trapped.
   494  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   495  	// Cannot grow scheduler stack (m->g0).
   496  	get_tls(CX)
   497  	MOVL	g(CX), BX
   498  	MOVL	g_m(BX), BX
   499  	MOVL	m_g0(BX), SI
   500  	CMPL	g(CX), SI
   501  	JNE	3(PC)	// not g0: skip the call and the trap
   502  	CALL	runtime·badmorestackg0(SB)
   503  	INT	$3
   504  
   505  	// Cannot grow signal stack.
   506  	MOVL	m_gsignal(BX), SI
   507  	CMPL	g(CX), SI
   508  	JNE	3(PC)	// not gsignal: skip the call and the trap
   509  	CALL	runtime·badmorestackgsignal(SB)
   510  	INT	$3
   511  
   512  	// Called from f.
   513  	// Set m->morebuf to f's caller.
   514  	MOVL	4(SP), DI	// f's caller's PC
   515  	MOVL	DI, (m_morebuf+gobuf_pc)(BX)
   516  	LEAL	8(SP), CX	// f's caller's SP
   517  	MOVL	CX, (m_morebuf+gobuf_sp)(BX)
   518  	get_tls(CX)	// CX was reused above; reload the TLS base
   519  	MOVL	g(CX), SI
   520  	MOVL	SI, (m_morebuf+gobuf_g)(BX)
   521  
   522  	// Set g->sched to context in f.
   523  	MOVL	0(SP), AX	// f's PC
   524  	MOVL	AX, (g_sched+gobuf_pc)(SI)
   525  	MOVL	SI, (g_sched+gobuf_g)(SI)
   526  	LEAL	4(SP), AX	// f's SP
   527  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   528  	MOVL	DX, (g_sched+gobuf_ctxt)(SI)	// DX as it was on entry (morestack_noctxt zeroes it first)
   529  
   530  	// Call newstack on m->g0's stack.
   531  	MOVL	m_g0(BX), BP
   532  	MOVL	BP, g(CX)
   533  	MOVL	(g_sched+gobuf_sp)(BP), AX
   534  	MOVL	-4(AX), BX	// fault if CALL would, before smashing SP
   535  	MOVL	AX, SP
   536  	CALL	runtime·newstack(SB)
   537  	MOVL	$0, 0x1003	// crash if newstack returns
   538  	RET
   539  
   540  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0-0
        	// Entry to morestack with no context: zero DX, which morestack
        	// stores as g->sched.ctxt, then jump to morestack.
   541  	MOVL	$0, DX
   542  	JMP runtime·morestack(SB)
   543  
   544  // reflectcall: call a function with the given argument list
   545  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   546  // we don't have variable-sized frames, so we use a small number
   547  // of constant-sized-frame functions to encode a few bits of size in the pc.
   548  // Caution: ugly multiline assembly macros in your future!
        //
        // DISPATCH(NAME, MAXSIZE) expects the argument size in CX. If CX is at
        // most MAXSIZE (unsigned compare) it jumps to NAME through AX; otherwise
        // JA 3(PC) skips the MOVL/JMP pair and control falls through to whatever
        // follows the macro.
   549  
   550  #define DISPATCH(NAME,MAXSIZE)		\
   551  	CMPL	CX, $MAXSIZE;		\
   552  	JA	3(PC);			\
   553  	MOVL	$NAME(SB), AX;		\
   554  	JMP	AX
   555  // Note: can't just "JMP NAME(SB)" - bad inlining results.
   556  
   557  TEXT reflect·call(SB), NOSPLIT, $0-0
        	// Entry point under the reflect package's name; jumps straight to ·reflectcall.
   558  	JMP	·reflectcall(SB)
   559  
   560  TEXT ·reflectcall(SB), NOSPLIT, $0-20
        	// Loads argsize into CX and jumps to the first call<N> routine whose
        	// N is >= argsize. Sizes above 1073741824 jump to
        	// runtime·badreflectcall.
   561  	MOVL	argsize+12(FP), CX
   562  	DISPATCH(runtime·call16, 16)
   563  	DISPATCH(runtime·call32, 32)
   564  	DISPATCH(runtime·call64, 64)
   565  	DISPATCH(runtime·call128, 128)
   566  	DISPATCH(runtime·call256, 256)
   567  	DISPATCH(runtime·call512, 512)
   568  	DISPATCH(runtime·call1024, 1024)
   569  	DISPATCH(runtime·call2048, 2048)
   570  	DISPATCH(runtime·call4096, 4096)
   571  	DISPATCH(runtime·call8192, 8192)
   572  	DISPATCH(runtime·call16384, 16384)
   573  	DISPATCH(runtime·call32768, 32768)
   574  	DISPATCH(runtime·call65536, 65536)
   575  	DISPATCH(runtime·call131072, 131072)
   576  	DISPATCH(runtime·call262144, 262144)
   577  	DISPATCH(runtime·call524288, 524288)
   578  	DISPATCH(runtime·call1048576, 1048576)
   579  	DISPATCH(runtime·call2097152, 2097152)
   580  	DISPATCH(runtime·call4194304, 4194304)
   581  	DISPATCH(runtime·call8388608, 8388608)
   582  	DISPATCH(runtime·call16777216, 16777216)
   583  	DISPATCH(runtime·call33554432, 33554432)
   584  	DISPATCH(runtime·call67108864, 67108864)
   585  	DISPATCH(runtime·call134217728, 134217728)
   586  	DISPATCH(runtime·call268435456, 268435456)
   587  	DISPATCH(runtime·call536870912, 536870912)
   588  	DISPATCH(runtime·call1073741824, 1073741824)
        	// No size class matched.
   589  	MOVL	$runtime·badreflectcall(SB), AX
   590  	JMP	AX
   591  
   592  #define CALLFN(NAME,MAXSIZE)			\
   593  TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
   594  	NO_LOCAL_POINTERS;			\
   595  	/* copy argsize bytes from argptr to the bottom of this frame (SP) */		\
   596  	MOVL	argptr+8(FP), SI;		\
   597  	MOVL	argsize+12(FP), CX;		\
   598  	MOVL	SP, DI;				\
   599  	REP;MOVSB;				\
   600  	/* call function: DX = f, AX = code pointer loaded from f */			\
   601  	MOVL	f+4(FP), DX;			\
   602  	MOVL	(DX), AX; 			\
   603  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   604  	CALL	AX;				\
   605  	/* copy return values back: hand callRet argtype in DX, dst = argptr+retoffset in DI, src = SP+retoffset in SI, size = argsize-retoffset in CX */		\
   606  	MOVL	argtype+0(FP), DX;		\
   607  	MOVL	argptr+8(FP), DI;		\
   608  	MOVL	argsize+12(FP), CX;		\
   609  	MOVL	retoffset+16(FP), BX;		\
   610  	MOVL	SP, SI;				\
   611  	ADDL	BX, DI;				\
   612  	ADDL	BX, SI;				\
   613  	SUBL	BX, CX;				\
   614  	CALL	callRet<>(SB);			\
   615  	RET
   616  
   617  // callRet copies return values back at the end of call*. This is a
   618  // separate function so it can allocate stack space for the arguments
   619  // to reflectcallmove. It does not follow the Go ABI; it expects its
   620  // arguments in registers.
        // Register inputs, as set up by CALLFN: DX = argtype, DI = destination,
        // SI = source, CX = byte count. They are stored in that order as the
        // four stack arguments of runtime·reflectcallmove.
   621  TEXT callRet<>(SB), NOSPLIT, $16-0
   622  	MOVL	DX, 0(SP)
   623  	MOVL	DI, 4(SP)
   624  	MOVL	SI, 8(SP)
   625  	MOVL	CX, 12(SP)
   626  	CALL	runtime·reflectcallmove(SB)
   627  	RET
   628  
   629  CALLFN(·call16, 16)	// one fixed-frame routine per size class that ·reflectcall dispatches to
   630  CALLFN(·call32, 32)
   631  CALLFN(·call64, 64)
   632  CALLFN(·call128, 128)
   633  CALLFN(·call256, 256)
   634  CALLFN(·call512, 512)
   635  CALLFN(·call1024, 1024)
   636  CALLFN(·call2048, 2048)
   637  CALLFN(·call4096, 4096)
   638  CALLFN(·call8192, 8192)
   639  CALLFN(·call16384, 16384)
   640  CALLFN(·call32768, 32768)
   641  CALLFN(·call65536, 65536)
   642  CALLFN(·call131072, 131072)
   643  CALLFN(·call262144, 262144)
   644  CALLFN(·call524288, 524288)
   645  CALLFN(·call1048576, 1048576)
   646  CALLFN(·call2097152, 2097152)
   647  CALLFN(·call4194304, 4194304)
   648  CALLFN(·call8388608, 8388608)
   649  CALLFN(·call16777216, 16777216)
   650  CALLFN(·call33554432, 33554432)
   651  CALLFN(·call67108864, 67108864)
   652  CALLFN(·call134217728, 134217728)
   653  CALLFN(·call268435456, 268435456)
   654  CALLFN(·call536870912, 536870912)
   655  CALLFN(·call1073741824, 1073741824)
   656  
   657  TEXT runtime·procyield(SB),NOSPLIT,$0-4
        	// Spin for `cycles` iterations, executing PAUSE on each one.
        	// The frame declares 4 bytes of arguments because the body reads
        	// the 4-byte cycles argument at 0(FP).
        	// The counter is decremented before it is tested, so the loop body
        	// always runs at least once.
   658  	MOVL	cycles+0(FP), AX
   659  again:
   660  	PAUSE
   661  	SUBL	$1, AX
   662  	JNZ	again
   663  	RET
   664  
   665  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
        	// Intentionally empty: the body is a bare RET.
   666  	// Stores are already ordered on x86, so this is just a
   667  	// compile barrier.
   668  	RET
   669  
   670  // void jmpdefer(fn, sp);
   671  // called from deferreturn.
   672  // 1. pop the caller
   673  // 2. sub 5 bytes (the length of CALL & a 32 bit displacement) from the callers
   674  //    return (when building for shared libraries, subtract 16 bytes -- 5 bytes
   675  //    for CALL & displacement to call __x86.get_pc_thunk.cx, 6 bytes for the
   676  //    LEAL to load the offset into BX, and finally 5 for the call & displacement)
   677  // 3. jmp to the argument
        // The subtraction rewinds the return address stored just below argp, so
        // that when the deferred function returns it re-executes the CALL that
        // preceded that return address. The byte counts above depend on the
        // exact length of the instruction sequence at that call site.
   678  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
   679  	MOVL	fv+0(FP), DX	// fn
   680  	MOVL	argp+4(FP), BX	// caller sp
   681  	LEAL	-4(BX), SP	// caller sp after CALL
   682  #ifdef GOBUILDMODE_shared
   683  	SUBL	$16, (SP)	// return to CALL again
   684  #else
   685  	SUBL	$5, (SP)	// return to CALL again
   686  #endif
   687  	MOVL	0(DX), BX	// BX = code pointer stored in the func value
   688  	JMP	BX	// but first run the deferred function
   689  
   690  // Save state of caller into g->sched.
        // Stores an SP and PC into the current g's sched, zeroes sched.ret, and
        // calls runtime·badctxt if sched.ctxt is non-zero. AX and BX are pushed
        // on entry and popped before returning.
   691  TEXT gosave<>(SB),NOSPLIT,$0
   692  	PUSHL	AX
   693  	PUSHL	BX
   694  	get_tls(BX)
   695  	MOVL	g(BX), BX	// BX = current g
   696  	LEAL	arg+0(FP), AX
   697  	MOVL	AX, (g_sched+gobuf_sp)(BX)
   698  	MOVL	-4(AX), AX	// word just below the saved SP, stored as the PC
   699  	MOVL	AX, (g_sched+gobuf_pc)(BX)
   700  	MOVL	$0, (g_sched+gobuf_ret)(BX)
   701  	// Assert ctxt is zero. See func save.
   702  	MOVL	(g_sched+gobuf_ctxt)(BX), AX
   703  	TESTL	AX, AX
   704  	JZ	2(PC)	// ctxt == 0: skip the badctxt call
   705  	CALL	runtime·badctxt(SB)
   706  	POPL	BX
   707  	POPL	AX
   708  	RET
   709  
   710  // func asmcgocall(fn, arg unsafe.Pointer) int32
   711  // Call fn(arg) on the scheduler stack,
   712  // aligned appropriately for the gcc ABI.
   713  // See cgocall.go for more details.
        // The value fn leaves in AX is stored as the int32 result.
   714  TEXT ·asmcgocall(SB),NOSPLIT,$0-12
   715  	MOVL	fn+0(FP), AX
   716  	MOVL	arg+4(FP), BX
   717  
   718  	MOVL	SP, DX	// DX = SP on entry, used below to compute the stack depth
   719  
   720  	// Figure out if we need to switch to m->g0 stack.
   721  	// We get called to create new OS threads too, and those
   722  	// come in on the m->g0 stack already.
   723  	get_tls(CX)
   724  	MOVL	g(CX), BP
   725  	MOVL	g_m(BP), BP
   726  	MOVL	m_g0(BP), SI	// SI = m->g0
   727  	MOVL	g(CX), DI	// DI = current g
   728  	CMPL	SI, DI
   729  	JEQ	noswitch
   730  	CALL	gosave<>(SB)
   731  	get_tls(CX)
   732  	MOVL	SI, g(CX)	// g = m->g0
   733  	MOVL	(g_sched+gobuf_sp)(SI), SP
   734  
   735  noswitch:
   736  	// Now on a scheduling stack (a pthread-created stack).
   737  	SUBL	$32, SP
   738  	ANDL	$~15, SP	// alignment, perhaps unnecessary
   739  	MOVL	DI, 8(SP)	// save g
   740  	MOVL	(g_stack+stack_hi)(DI), DI
   741  	SUBL	DX, DI	// depth = g->stack.hi - entry SP
   742  	MOVL	DI, 4(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   743  	MOVL	BX, 0(SP)	// first argument in x86-32 ABI
   744  	CALL	AX
   745  
   746  	// Restore registers, g, stack pointer.
   747  	get_tls(CX)
   748  	MOVL	8(SP), DI	// saved g
   749  	MOVL	(g_stack+stack_hi)(DI), SI
   750  	SUBL	4(SP), SI	// SP = g->stack.hi - saved depth
   751  	MOVL	DI, g(CX)
   752  	MOVL	SI, SP
   753  
   754  	MOVL	AX, ret+8(FP)
   755  	RET
   756  
   757  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
   758  // Turn the fn into a Go func (by taking its address) and call
   759  // cgocallback_gofunc.
        // The four outgoing arguments are &fn, frame, framesize and ctxt; the
        // call is made indirectly through AX.
   760  TEXT runtime·cgocallback(SB),NOSPLIT,$16-16
   761  	LEAL	fn+0(FP), AX	// address of the fn argument slot, passed as the first argument
   762  	MOVL	AX, 0(SP)
   763  	MOVL	frame+4(FP), AX
   764  	MOVL	AX, 4(SP)
   765  	MOVL	framesize+8(FP), AX
   766  	MOVL	AX, 8(SP)
   767  	MOVL	ctxt+12(FP), AX
   768  	MOVL	AX, 12(SP)
   769  	MOVL	$runtime·cgocallback_gofunc(SB), AX
   770  	CALL	AX
   771  	RET
   772  
   773  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
   774  // See cgocall.go for more details.
        // Outline: obtain an m if the thread has no g (needm), save
        // m->g0->sched.sp on the g0 stack, switch to m->curg's stack, call
        // runtime·cgocallbackg, switch back to g0, and call dropm if an m was
        // borrowed. DX carries the "old m" value throughout (0 when borrowed).
   775  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-16
   776  	NO_LOCAL_POINTERS
   777  
   778  	// If g is nil, Go did not create the current thread.
   779  	// Call needm to obtain one for temporary use.
   780  	// In this case, we're running on the thread stack, so there's
   781  	// lots of space, but the linker doesn't know. Hide the call from
   782  	// the linker analysis by using an indirect call through AX.
   783  	get_tls(CX)
   784  #ifdef GOOS_windows
   785  	MOVL	$0, BP
   786  	CMPL	CX, $0
   787  	JEQ	2(PC) // TODO
   788  #endif
   789  	MOVL	g(CX), BP
   790  	CMPL	BP, $0
   791  	JEQ	needm
   792  	MOVL	g_m(BP), BP
   793  	MOVL	BP, DX // saved copy of oldm
   794  	JMP	havem
   795  needm:
   796  	MOVL	$0, 0(SP)
   797  	MOVL	$runtime·needm(SB), AX
   798  	CALL	AX
   799  	MOVL	0(SP), DX	// reload the 0(SP) slot that was zeroed before the call into DX
   800  	get_tls(CX)
   801  	MOVL	g(CX), BP
   802  	MOVL	g_m(BP), BP
   803  
   804  	// Set m->g0->sched.sp = SP, so that if a panic happens
   805  	// during the function we are about to execute, it will
   806  	// have a valid SP to run on the g0 stack.
   807  	// The next few lines (after the havem label)
   808  	// will save this SP onto the stack and then write
   809  	// the same SP back to m->g0->sched.sp. That seems redundant,
   810  	// but if an unrecovered panic happens, unwindm will
   811  	// restore the g->sched.sp from the stack location
   812  	// and then systemstack will try to use it. If we don't set it here,
   813  	// that restored SP will be uninitialized (typically 0) and
   814  	// will not be usable.
   815  	MOVL	m_g0(BP), SI
   816  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   817  
   818  havem:
   819  	// Now there's a valid m, and we're running on its m->g0.
   820  	// Save current m->g0->sched.sp on stack and then set it to SP.
   821  	// Save current sp in m->g0->sched.sp in preparation for
   822  	// switch back to m->curg stack.
   823  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   824  	MOVL	m_g0(BP), SI
   825  	MOVL	(g_sched+gobuf_sp)(SI), AX
   826  	MOVL	AX, 0(SP)
   827  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   828  
   829  	// Switch to m->curg stack and call runtime.cgocallbackg.
   830  	// Because we are taking over the execution of m->curg
   831  	// but *not* resuming what had been running, we need to
   832  	// save that information (m->curg->sched) so we can restore it.
   833  	// We can restore m->curg->sched.sp easily, because calling
   834  	// runtime.cgocallbackg leaves SP unchanged upon return.
   835  	// To save m->curg->sched.pc, we push it onto the stack.
   836  	// This has the added benefit that it looks to the traceback
   837  	// routine like cgocallbackg is going to return to that
   838  	// PC (because the frame we allocate below has the same
   839  	// size as cgocallback_gofunc's frame declared above)
   840  	// so that the traceback will seamlessly trace back into
   841  	// the earlier calls.
   842  	//
   843  	// In the new goroutine, 4(SP) holds the saved oldm (DX) register.
   844  	// 8(SP) is unused.
   845  	MOVL	m_curg(BP), SI
   846  	MOVL	SI, g(CX)	// g = m->curg
   847  	MOVL	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
   848  	MOVL	(g_sched+gobuf_pc)(SI), BP
   849  	MOVL	BP, -4(DI)	// saved curg PC sits where a return address would be
   850  	MOVL	ctxt+12(FP), CX
   851  	LEAL	-(4+12)(DI), SP	// 4 bytes for the saved PC plus a 12-byte frame
   852  	MOVL	DX, 4(SP)
   853  	MOVL	CX, 0(SP)	// ctxt is the argument at 0(SP)
   854  	CALL	runtime·cgocallbackg(SB)
   855  	MOVL	4(SP), DX	// reload oldm
   856  
   857  	// Restore g->sched (== m->curg->sched) from saved values.
   858  	get_tls(CX)
   859  	MOVL	g(CX), SI
   860  	MOVL	12(SP), BP	// the PC stored at -4(DI) above
   861  	MOVL	BP, (g_sched+gobuf_pc)(SI)
   862  	LEAL	(12+4)(SP), DI	// the original curg SP
   863  	MOVL	DI, (g_sched+gobuf_sp)(SI)
   864  
   865  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   866  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   867  	// so we do not have to restore it.)
   868  	MOVL	g(CX), BP
   869  	MOVL	g_m(BP), BP
   870  	MOVL	m_g0(BP), SI
   871  	MOVL	SI, g(CX)
   872  	MOVL	(g_sched+gobuf_sp)(SI), SP
   873  	MOVL	0(SP), AX	// the old m->g0->sched.sp saved at havem
   874  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   875  	
   876  	// If the m on entry was nil, we called needm above to borrow an m
   877  	// for the duration of the call. Since the call is over, return it with dropm.
   878  	CMPL	DX, $0
   879  	JNE 3(PC)	// had an m on entry: skip the dropm call
   880  	MOVL	$runtime·dropm(SB), AX
   881  	CALL	AX
   882  
   883  	// Done!
   884  	RET
   885  
   886  // void setg(G*); set g. for use by needm.
        // Stores gg in the thread-local g slot. On Windows it first updates
        // 0x14(FS): to 0 when gg is nil (returning without touching the g
        // slot), otherwise to the address of gg->m->tls.
   887  TEXT runtime·setg(SB), NOSPLIT, $0-4
   888  	MOVL	gg+0(FP), BX
   889  #ifdef GOOS_windows
   890  	CMPL	BX, $0
   891  	JNE	settls
   892  	MOVL	$0, 0x14(FS)
   893  	RET
   894  settls:
   895  	MOVL	g_m(BX), AX
   896  	LEAL	m_tls(AX), AX	// AX = &gg->m->tls
   897  	MOVL	AX, 0x14(FS)
   898  #endif
   899  	get_tls(CX)
   900  	MOVL	BX, g(CX)
   901  	RET
   902  
   903  // void setg_gcc(G*); set g. for use by gcc
        // Stores its single stack argument in the thread-local g slot.
        // rt0_go passes this function's address to _cgo_init.
   904  TEXT setg_gcc<>(SB), NOSPLIT, $0
   905  	get_tls(AX)
   906  	MOVL	gg+0(FP), DX
   907  	MOVL	DX, g(AX)
   908  	RET
   909  
   910  // check that SP is strictly between g->stack.lo and g->stack.hi
        // (unsigned compares); executes INT $3 when either bound is violated.
   911  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   912  	get_tls(CX)
   913  	MOVL	g(CX), AX
   914  	CMPL	(g_stack+stack_hi)(AX), SP
   915  	JHI	2(PC)	// stack.hi > SP: skip the trap
   916  	INT	$3
   917  	CMPL	SP, (g_stack+stack_lo)(AX)
   918  	JHI	2(PC)	// SP > stack.lo: skip the trap
   919  	INT	$3
   920  	RET
   921  
   922  // func cputicks() int64
        // Returns the RDTSC counter (DX:AX). When support_sse2 is 1, a fence is
        // executed first: LFENCE if lfenceBeforeRdtsc is 1, otherwise MFENCE.
        // Without SSE2 no fence is executed.
   923  TEXT runtime·cputicks(SB),NOSPLIT,$0-8
   924  	CMPB	runtime·support_sse2(SB), $1
   925  	JNE	done
   926  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   927  	JNE	mfence
   928  	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
   929  	JMP	done
   930  mfence:
   931  	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
   932  done:
   933  	RDTSC
   934  	MOVL	AX, ret_lo+0(FP)	// low 32 bits
   935  	MOVL	DX, ret_hi+4(FP)	// high 32 bits
   936  	RET
   937  
   938  TEXT runtime·ldt0setup(SB),NOSPLIT,$16-0
        	// Calls runtime·setldt(7, &runtime·m0.tls, 32).
   939  	// set up ldt 7 to point at m0.tls
   940  	// ldt 1 would be fine on Linux, but on OS X, 7 is as low as we can go.
   941  	// the entry number is just a hint.  setldt will set up GS with what it used.
   942  	MOVL	$7, 0(SP)
   943  	LEAL	runtime·m0+m_tls(SB), AX
   944  	MOVL	AX, 4(SP)
   945  	MOVL	$32, 8(SP)	// sizeof(tls array)
   946  	CALL	runtime·setldt(SB)
   947  	RET
   948  
   949  TEXT runtime·emptyfunc(SB),0,$0-0
        	// Does nothing. Declared with flags 0 (not NOSPLIT); rt0_go calls it
        	// to "fault if stack check is wrong".
   950  	RET
   951  
   952  // hash function using AES hardware instructions
        // Loads the register arguments of aeshashbody (AX = data pointer,
        // BX = size, DX = address of the result slot) and jumps to it.
   953  TEXT runtime·aeshash(SB),NOSPLIT,$0-16
   954  	MOVL	p+0(FP), AX	// ptr to data
   955  	MOVL	s+8(FP), BX	// size
   956  	LEAL	ret+12(FP), DX	// where aeshashbody stores the result
   957  	JMP	runtime·aeshashbody(SB)
   958  
   959  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
        	// String variant of aeshash: p points at a string header, from which
        	// the data pointer (offset 0) and the length (offset 4) are loaded
        	// before tail-jumping to aeshashbody. The seed at h+4(FP) is read by
        	// aeshashbody from this same frame.
   960  	MOVL	p+0(FP), AX	// ptr to string object
   961  	MOVL	4(AX), BX	// length of string
   962  	MOVL	(AX), AX	// string data
   963  	LEAL	ret+8(FP), DX	// DX = address of the result word
   964  	JMP	runtime·aeshashbody(SB)
   965  
   966  // AX: data
   967  // BX: length
   968  // DX: address to put return value
        // Shared body of aeshash and aeshashstr, reached by JMP. Because it is
        // entered by a jump and has a $0-0 frame, h+4(FP) below addresses the
        // seed argument of whichever entry point jumped here.
        // Dispatches on the length in BX to one of five code paths
        // (0, 1-15, 16, 17-32, 33-64, 65+ bytes) and stores the low 32 bits of
        // the final state at (DX).
   969  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   970  	MOVL	h+4(FP), X0	            // 32 bits of per-table hash seed
   971  	PINSRW	$4, BX, X0	            // 16 bits of length
   972  	PSHUFHW	$0, X0, X0	            // replace size with its low 2 bytes repeated 4 times
   973  	MOVO	X0, X1                      // save unscrambled seed
   974  	PXOR	runtime·aeskeysched(SB), X0 // xor in per-process seed
   975  	AESENC	X0, X0                      // scramble seed
   976  
        	// Dispatch on length (unsigned comparisons).
   977  	CMPL	BX, $16
   978  	JB	aes0to15
   979  	JE	aes16
   980  	CMPL	BX, $32
   981  	JBE	aes17to32
   982  	CMPL	BX, $64
   983  	JBE	aes33to64
   984  	JMP	aes65plus
   985  	
   986  aes0to15:
   987  	TESTL	BX, BX
   988  	JE	aes0
   989  
        	// AX now points 16 bytes past the start of the data. If bits 4-11 of
        	// that address are all zero, take the endofpage path instead of
        	// loading 16 bytes from the start of the data.
   990  	ADDL	$16, AX
   991  	TESTW	$0xff0, AX
   992  	JE	endofpage
   993  
   994  	// 16 bytes loaded at this address won't cross
   995  	// a page boundary, so we can load it directly.
   996  	MOVOU	-16(AX), X1
   997  	ADDL	BX, BX	// BX = 2*len, so BX*8 below is len*16: one 16-byte table entry per length
   998  	PAND	masks<>(SB)(BX*8), X1	// zero the bytes beyond the input length
   999  
  1000  final1:	
  1001  	AESENC	X0, X1  // scramble input, xor in seed
  1002  	AESENC	X1, X1  // scramble combo 2 times
  1003  	AESENC	X1, X1
  1004  	MOVL	X1, (DX)
  1005  	RET
  1006  
  1007  endofpage:
  1008  	// address ends in 1111xxxx. Might be up against
  1009  	// a page boundary, so load ending at last byte.
  1010  	// Then shift bytes down using pshufb.
  1011  	MOVOU	-32(AX)(BX*1), X1	// AX was advanced by 16 above, so this is data+len-16
  1012  	ADDL	BX, BX	// BX = 2*len, so BX*8 below is len*16
  1013  	PSHUFB	shifts<>(SB)(BX*8), X1
  1014  	JMP	final1
  1015  
  1016  aes0:
  1017  	// Return scrambled input seed
  1018  	AESENC	X0, X0
  1019  	MOVL	X0, (DX)
  1020  	RET
  1021  
  1022  aes16:
  1023  	MOVOU	(AX), X1	// exactly 16 bytes: load them whole, no masking needed
  1024  	JMP	final1
  1025  
  1026  aes17to32:
  1027  	// make second starting seed
  1028  	PXOR	runtime·aeskeysched+16(SB), X1
  1029  	AESENC	X1, X1
  1030  	
  1031  	// load data to be hashed
        	// (first 16 bytes and last 16 bytes; the two loads overlap when len < 32)
  1032  	MOVOU	(AX), X2
  1033  	MOVOU	-16(AX)(BX*1), X3
  1034  
  1035  	// scramble 3 times
  1036  	AESENC	X0, X2
  1037  	AESENC	X1, X3
  1038  	AESENC	X2, X2
  1039  	AESENC	X3, X3
  1040  	AESENC	X2, X2
  1041  	AESENC	X3, X3
  1042  
  1043  	// combine results
  1044  	PXOR	X3, X2
  1045  	MOVL	X2, (DX)
  1046  	RET
  1047  
  1048  aes33to64:
  1049  	// make 3 more starting seeds
  1050  	MOVO	X1, X2
  1051  	MOVO	X1, X3
  1052  	PXOR	runtime·aeskeysched+16(SB), X1
  1053  	PXOR	runtime·aeskeysched+32(SB), X2
  1054  	PXOR	runtime·aeskeysched+48(SB), X3
  1055  	AESENC	X1, X1
  1056  	AESENC	X2, X2
  1057  	AESENC	X3, X3
  1058  	
        	// load the first 32 and the last 32 bytes (overlapping when len < 64)
  1059  	MOVOU	(AX), X4
  1060  	MOVOU	16(AX), X5
  1061  	MOVOU	-32(AX)(BX*1), X6
  1062  	MOVOU	-16(AX)(BX*1), X7
  1063  	
        	// three scramble rounds per lane, the first keyed with the four seeds
  1064  	AESENC	X0, X4
  1065  	AESENC	X1, X5
  1066  	AESENC	X2, X6
  1067  	AESENC	X3, X7
  1068  	
  1069  	AESENC	X4, X4
  1070  	AESENC	X5, X5
  1071  	AESENC	X6, X6
  1072  	AESENC	X7, X7
  1073  	
  1074  	AESENC	X4, X4
  1075  	AESENC	X5, X5
  1076  	AESENC	X6, X6
  1077  	AESENC	X7, X7
  1078  
        	// fold the four lanes into X4
  1079  	PXOR	X6, X4
  1080  	PXOR	X7, X5
  1081  	PXOR	X5, X4
  1082  	MOVL	X4, (DX)
  1083  	RET
  1084  
  1085  aes65plus:
  1086  	// make 3 more starting seeds
  1087  	MOVO	X1, X2
  1088  	MOVO	X1, X3
  1089  	PXOR	runtime·aeskeysched+16(SB), X1
  1090  	PXOR	runtime·aeskeysched+32(SB), X2
  1091  	PXOR	runtime·aeskeysched+48(SB), X3
  1092  	AESENC	X1, X1
  1093  	AESENC	X2, X2
  1094  	AESENC	X3, X3
  1095  	
  1096  	// start with last (possibly overlapping) block
  1097  	MOVOU	-64(AX)(BX*1), X4
  1098  	MOVOU	-48(AX)(BX*1), X5
  1099  	MOVOU	-32(AX)(BX*1), X6
  1100  	MOVOU	-16(AX)(BX*1), X7
  1101  
  1102  	// scramble state once
  1103  	AESENC	X0, X4
  1104  	AESENC	X1, X5
  1105  	AESENC	X2, X6
  1106  	AESENC	X3, X7
  1107  
  1108  	// compute number of remaining 64-byte blocks
        	// BX = (len-1)/64, which is at least 1 here because len >= 65
  1109  	DECL	BX
  1110  	SHRL	$6, BX
  1111  	
  1112  aesloop:
  1113  	// scramble state, xor in a block
        	// (X0-X3 are reused as data registers; the seeds are no longer needed)
  1114  	MOVOU	(AX), X0
  1115  	MOVOU	16(AX), X1
  1116  	MOVOU	32(AX), X2
  1117  	MOVOU	48(AX), X3
  1118  	AESENC	X0, X4
  1119  	AESENC	X1, X5
  1120  	AESENC	X2, X6
  1121  	AESENC	X3, X7
  1122  
  1123  	// scramble state
  1124  	AESENC	X4, X4
  1125  	AESENC	X5, X5
  1126  	AESENC	X6, X6
  1127  	AESENC	X7, X7
  1128  
  1129  	ADDL	$64, AX
  1130  	DECL	BX
  1131  	JNE	aesloop	// loop while blocks remain (flags from DECL)
  1132  
  1133  	// 2 more scrambles to finish
  1134  	AESENC	X4, X4
  1135  	AESENC	X5, X5
  1136  	AESENC	X6, X6
  1137  	AESENC	X7, X7
  1138  	
  1139  	AESENC	X4, X4
  1140  	AESENC	X5, X5
  1141  	AESENC	X6, X6
  1142  	AESENC	X7, X7
  1143  
        	// fold the four lanes into X4
  1144  	PXOR	X6, X4
  1145  	PXOR	X7, X5
  1146  	PXOR	X5, X4
  1147  	MOVL	X4, (DX)
  1148  	RET
  1149  
  1150  TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
        	// Hashes the 4 bytes at p with seed h: packs seed and data into X0,
        	// applies three AESENC rounds keyed from runtime·aeskeysched, and
        	// returns the low 32 bits of the result.
  1151  	MOVL	p+0(FP), AX	// ptr to data
  1152  	MOVL	h+4(FP), X0	// seed
  1153  	PINSRD	$1, (AX), X0	// data
  1154  	AESENC	runtime·aeskeysched+0(SB), X0
  1155  	AESENC	runtime·aeskeysched+16(SB), X0
  1156  	AESENC	runtime·aeskeysched+32(SB), X0
  1157  	MOVL	X0, ret+8(FP)
  1158  	RET
  1159  
  1160  TEXT runtime·aeshash64(SB),NOSPLIT,$0-12
        	// Hashes the 8 bytes at p with seed h: data goes in the low 64 bits of
        	// X0 and the seed in dword 2, followed by three AESENC rounds keyed
        	// from runtime·aeskeysched. Returns the low 32 bits of the result.
  1161  	MOVL	p+0(FP), AX	// ptr to data
  1162  	MOVQ	(AX), X0	// data
  1163  	PINSRD	$2, h+4(FP), X0	// seed
  1164  	AESENC	runtime·aeskeysched+0(SB), X0
  1165  	AESENC	runtime·aeskeysched+16(SB), X0
  1166  	AESENC	runtime·aeskeysched+32(SB), X0
  1167  	MOVL	X0, ret+8(FP)
  1168  	RET
  1169  
  1170  // simple mask to get rid of data in the high part of the register.
        // masks<> holds 16 entries of 16 bytes each (256 bytes, read-only).
        // Entry i, at offset i*16, has its low i bytes set to 0xff and the
        // remaining bytes zero. aeshashbody indexes it by length*16 and ANDs
        // the entry into a 16-byte load to keep only the first `length` bytes.
  1171  DATA masks<>+0x00(SB)/4, $0x00000000
  1172  DATA masks<>+0x04(SB)/4, $0x00000000
  1173  DATA masks<>+0x08(SB)/4, $0x00000000
  1174  DATA masks<>+0x0c(SB)/4, $0x00000000
  1175  	
  1176  DATA masks<>+0x10(SB)/4, $0x000000ff
  1177  DATA masks<>+0x14(SB)/4, $0x00000000
  1178  DATA masks<>+0x18(SB)/4, $0x00000000
  1179  DATA masks<>+0x1c(SB)/4, $0x00000000
  1180  	
  1181  DATA masks<>+0x20(SB)/4, $0x0000ffff
  1182  DATA masks<>+0x24(SB)/4, $0x00000000
  1183  DATA masks<>+0x28(SB)/4, $0x00000000
  1184  DATA masks<>+0x2c(SB)/4, $0x00000000
  1185  	
  1186  DATA masks<>+0x30(SB)/4, $0x00ffffff
  1187  DATA masks<>+0x34(SB)/4, $0x00000000
  1188  DATA masks<>+0x38(SB)/4, $0x00000000
  1189  DATA masks<>+0x3c(SB)/4, $0x00000000
  1190  	
  1191  DATA masks<>+0x40(SB)/4, $0xffffffff
  1192  DATA masks<>+0x44(SB)/4, $0x00000000
  1193  DATA masks<>+0x48(SB)/4, $0x00000000
  1194  DATA masks<>+0x4c(SB)/4, $0x00000000
  1195  	
  1196  DATA masks<>+0x50(SB)/4, $0xffffffff
  1197  DATA masks<>+0x54(SB)/4, $0x000000ff
  1198  DATA masks<>+0x58(SB)/4, $0x00000000
  1199  DATA masks<>+0x5c(SB)/4, $0x00000000
  1200  	
  1201  DATA masks<>+0x60(SB)/4, $0xffffffff
  1202  DATA masks<>+0x64(SB)/4, $0x0000ffff
  1203  DATA masks<>+0x68(SB)/4, $0x00000000
  1204  DATA masks<>+0x6c(SB)/4, $0x00000000
  1205  	
  1206  DATA masks<>+0x70(SB)/4, $0xffffffff
  1207  DATA masks<>+0x74(SB)/4, $0x00ffffff
  1208  DATA masks<>+0x78(SB)/4, $0x00000000
  1209  DATA masks<>+0x7c(SB)/4, $0x00000000
  1210  	
  1211  DATA masks<>+0x80(SB)/4, $0xffffffff
  1212  DATA masks<>+0x84(SB)/4, $0xffffffff
  1213  DATA masks<>+0x88(SB)/4, $0x00000000
  1214  DATA masks<>+0x8c(SB)/4, $0x00000000
  1215  	
  1216  DATA masks<>+0x90(SB)/4, $0xffffffff
  1217  DATA masks<>+0x94(SB)/4, $0xffffffff
  1218  DATA masks<>+0x98(SB)/4, $0x000000ff
  1219  DATA masks<>+0x9c(SB)/4, $0x00000000
  1220  	
  1221  DATA masks<>+0xa0(SB)/4, $0xffffffff
  1222  DATA masks<>+0xa4(SB)/4, $0xffffffff
  1223  DATA masks<>+0xa8(SB)/4, $0x0000ffff
  1224  DATA masks<>+0xac(SB)/4, $0x00000000
  1225  	
  1226  DATA masks<>+0xb0(SB)/4, $0xffffffff
  1227  DATA masks<>+0xb4(SB)/4, $0xffffffff
  1228  DATA masks<>+0xb8(SB)/4, $0x00ffffff
  1229  DATA masks<>+0xbc(SB)/4, $0x00000000
  1230  	
  1231  DATA masks<>+0xc0(SB)/4, $0xffffffff
  1232  DATA masks<>+0xc4(SB)/4, $0xffffffff
  1233  DATA masks<>+0xc8(SB)/4, $0xffffffff
  1234  DATA masks<>+0xcc(SB)/4, $0x00000000
  1235  	
  1236  DATA masks<>+0xd0(SB)/4, $0xffffffff
  1237  DATA masks<>+0xd4(SB)/4, $0xffffffff
  1238  DATA masks<>+0xd8(SB)/4, $0xffffffff
  1239  DATA masks<>+0xdc(SB)/4, $0x000000ff
  1240  	
  1241  DATA masks<>+0xe0(SB)/4, $0xffffffff
  1242  DATA masks<>+0xe4(SB)/4, $0xffffffff
  1243  DATA masks<>+0xe8(SB)/4, $0xffffffff
  1244  DATA masks<>+0xec(SB)/4, $0x0000ffff
  1245  	
  1246  DATA masks<>+0xf0(SB)/4, $0xffffffff
  1247  DATA masks<>+0xf4(SB)/4, $0xffffffff
  1248  DATA masks<>+0xf8(SB)/4, $0xffffffff
  1249  DATA masks<>+0xfc(SB)/4, $0x00ffffff
  1250  
  1251  GLOBL masks<>(SB),RODATA,$256
  1252  
  1253  // these are arguments to pshufb. They move data down from
  1254  // the high bytes of the register to the low bytes of the register.
  1255  // index is how many bytes to move.
        // shifts<> holds 16 entries of 16 bytes each (256 bytes, read-only).
        // Entry i, at offset i*16, selects source bytes 16-i..15 into destination
        // bytes 0..i-1; the remaining selector bytes are 0xff, for which PSHUFB
        // writes zero. Entry 0 is all zeros; the only visible use (the endofpage
        // path of aeshashbody) is reached with a non-zero length.
  1256  DATA shifts<>+0x00(SB)/4, $0x00000000
  1257  DATA shifts<>+0x04(SB)/4, $0x00000000
  1258  DATA shifts<>+0x08(SB)/4, $0x00000000
  1259  DATA shifts<>+0x0c(SB)/4, $0x00000000
  1260  	
  1261  DATA shifts<>+0x10(SB)/4, $0xffffff0f
  1262  DATA shifts<>+0x14(SB)/4, $0xffffffff
  1263  DATA shifts<>+0x18(SB)/4, $0xffffffff
  1264  DATA shifts<>+0x1c(SB)/4, $0xffffffff
  1265  	
  1266  DATA shifts<>+0x20(SB)/4, $0xffff0f0e
  1267  DATA shifts<>+0x24(SB)/4, $0xffffffff
  1268  DATA shifts<>+0x28(SB)/4, $0xffffffff
  1269  DATA shifts<>+0x2c(SB)/4, $0xffffffff
  1270  	
  1271  DATA shifts<>+0x30(SB)/4, $0xff0f0e0d
  1272  DATA shifts<>+0x34(SB)/4, $0xffffffff
  1273  DATA shifts<>+0x38(SB)/4, $0xffffffff
  1274  DATA shifts<>+0x3c(SB)/4, $0xffffffff
  1275  	
  1276  DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c
  1277  DATA shifts<>+0x44(SB)/4, $0xffffffff
  1278  DATA shifts<>+0x48(SB)/4, $0xffffffff
  1279  DATA shifts<>+0x4c(SB)/4, $0xffffffff
  1280  	
  1281  DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b
  1282  DATA shifts<>+0x54(SB)/4, $0xffffff0f
  1283  DATA shifts<>+0x58(SB)/4, $0xffffffff
  1284  DATA shifts<>+0x5c(SB)/4, $0xffffffff
  1285  	
  1286  DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a
  1287  DATA shifts<>+0x64(SB)/4, $0xffff0f0e
  1288  DATA shifts<>+0x68(SB)/4, $0xffffffff
  1289  DATA shifts<>+0x6c(SB)/4, $0xffffffff
  1290  	
  1291  DATA shifts<>+0x70(SB)/4, $0x0c0b0a09
  1292  DATA shifts<>+0x74(SB)/4, $0xff0f0e0d
  1293  DATA shifts<>+0x78(SB)/4, $0xffffffff
  1294  DATA shifts<>+0x7c(SB)/4, $0xffffffff
  1295  	
  1296  DATA shifts<>+0x80(SB)/4, $0x0b0a0908
  1297  DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c
  1298  DATA shifts<>+0x88(SB)/4, $0xffffffff
  1299  DATA shifts<>+0x8c(SB)/4, $0xffffffff
  1300  	
  1301  DATA shifts<>+0x90(SB)/4, $0x0a090807
  1302  DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b
  1303  DATA shifts<>+0x98(SB)/4, $0xffffff0f
  1304  DATA shifts<>+0x9c(SB)/4, $0xffffffff
  1305  	
  1306  DATA shifts<>+0xa0(SB)/4, $0x09080706
  1307  DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a
  1308  DATA shifts<>+0xa8(SB)/4, $0xffff0f0e
  1309  DATA shifts<>+0xac(SB)/4, $0xffffffff
  1310  	
  1311  DATA shifts<>+0xb0(SB)/4, $0x08070605
  1312  DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09
  1313  DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d
  1314  DATA shifts<>+0xbc(SB)/4, $0xffffffff
  1315  	
  1316  DATA shifts<>+0xc0(SB)/4, $0x07060504
  1317  DATA shifts<>+0xc4(SB)/4, $0x0b0a0908
  1318  DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c
  1319  DATA shifts<>+0xcc(SB)/4, $0xffffffff
  1320  	
  1321  DATA shifts<>+0xd0(SB)/4, $0x06050403
  1322  DATA shifts<>+0xd4(SB)/4, $0x0a090807
  1323  DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b
  1324  DATA shifts<>+0xdc(SB)/4, $0xffffff0f
  1325  	
  1326  DATA shifts<>+0xe0(SB)/4, $0x05040302
  1327  DATA shifts<>+0xe4(SB)/4, $0x09080706
  1328  DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a
  1329  DATA shifts<>+0xec(SB)/4, $0xffff0f0e
  1330  	
  1331  DATA shifts<>+0xf0(SB)/4, $0x04030201
  1332  DATA shifts<>+0xf4(SB)/4, $0x08070605
  1333  DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09
  1334  DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d
  1335  
  1336  GLOBL shifts<>(SB),RODATA,$256
  1337  
  1338  TEXT ·checkASM(SB),NOSPLIT,$0-1
        	// Returns a single result byte: 1 if the addresses of masks<> and
        	// shifts<> are both multiples of 16, 0 otherwise.
  1339  	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
  1340  	MOVL	$masks<>(SB), AX
  1341  	MOVL	$shifts<>(SB), BX
  1342  	ORL	BX, AX	// a low bit set in either address survives the OR
  1343  	TESTL	$15, AX
  1344  	SETEQ	ret+0(FP)	// 1 iff the low 4 bits of both addresses are zero
  1345  	RET
  1346  
  1347  // memequal(p, q unsafe.Pointer, size uintptr) bool
        // Reports whether the size bytes at a and at b are equal. Identical
        // pointers return true immediately without reading size; otherwise the
        // comparison is delegated to memeqbody by tail-jump, which writes the
        // result byte through AX.
  1348  TEXT runtime·memequal(SB),NOSPLIT,$0-13
  1349  	MOVL	a+0(FP), SI
  1350  	MOVL	b+4(FP), DI
  1351  	CMPL	SI, DI
  1352  	JEQ	eq	// same address: trivially equal
  1353  	MOVL	size+8(FP), BX	// BX = byte count for memeqbody
  1354  	LEAL	ret+12(FP), AX	// AX = address of the result byte
  1355  	JMP	runtime·memeqbody(SB)
  1356  eq:
  1357  	MOVB    $1, ret+12(FP)
  1358  	RET
  1359  
  1360  // memequal_varlen(a, b unsafe.Pointer) bool
        // Like memequal, but the byte count is not an argument: it is loaded
        // from offset 4 of the block DX points at on entry.
        // Identical pointers return true immediately without reading the size.
  1361  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
  1362  	MOVL    a+0(FP), SI
  1363  	MOVL    b+4(FP), DI
  1364  	CMPL    SI, DI
  1365  	JEQ     eq
  1366  	MOVL    4(DX), BX    // compiler stores size at offset 4 in the closure
  1367  	LEAL	ret+8(FP), AX	// AX = address of the result byte
  1368  	JMP	runtime·memeqbody(SB)
  1369  eq:
  1370  	MOVB    $1, ret+8(FP)
  1371  	RET
  1372  
  1373  TEXT bytes·Equal(SB),NOSPLIT,$0-25
        	// Compares two byte slices a and b (each a 12-byte header in the
        	// argument frame) and writes the boolean result at ret+24(FP).
        	// Different lengths yield false without touching the data; equal
        	// lengths are compared by memeqbody via tail-jump.
  1374  	MOVL	a_len+4(FP), BX
  1375  	MOVL	b_len+16(FP), CX
  1376  	CMPL	BX, CX
  1377  	JNE	eqret	// lengths differ: not equal
  1378  	MOVL	a+0(FP), SI
  1379  	MOVL	b+12(FP), DI
  1380  	LEAL	ret+24(FP), AX	// AX = address of the result byte
  1381  	JMP	runtime·memeqbody(SB)
  1382  eqret:
  1383  	MOVB	$0, ret+24(FP)
  1384  	RET
  1385  
  1386  // a in SI
  1387  // b in DI
  1388  // count in BX
  1389  // address of result byte in AX
        // Shared comparison body reached by JMP from memequal, memequal_varlen
        // and bytes·Equal. Writes 1 to (AX) if the BX bytes at SI and DI are
        // equal, 0 otherwise. Clobbers BX, CX, DX, SI, DI and X0-X7.
  1390  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1391  	CMPL	BX, $4
  1392  	JB	small
  1393  
  1394  	// 64 bytes at a time using xmm registers
        	// (only when runtime·support_sse2 is 1; otherwise fall to bigloop)
  1395  hugeloop:
  1396  	CMPL	BX, $64
  1397  	JB	bigloop
  1398  	CMPB	runtime·support_sse2(SB), $1
  1399  	JNE	bigloop
  1400  	MOVOU	(SI), X0
  1401  	MOVOU	(DI), X1
  1402  	MOVOU	16(SI), X2
  1403  	MOVOU	16(DI), X3
  1404  	MOVOU	32(SI), X4
  1405  	MOVOU	32(DI), X5
  1406  	MOVOU	48(SI), X6
  1407  	MOVOU	48(DI), X7
  1408  	PCMPEQB	X1, X0
  1409  	PCMPEQB	X3, X2
  1410  	PCMPEQB	X5, X4
  1411  	PCMPEQB	X7, X6
  1412  	PAND	X2, X0
  1413  	PAND	X6, X4
  1414  	PAND	X4, X0	// X0 byte is 0xff only where all four 16-byte lanes matched
  1415  	PMOVMSKB X0, DX
  1416  	ADDL	$64, SI
  1417  	ADDL	$64, DI
  1418  	SUBL	$64, BX
  1419  	CMPL	DX, $0xffff	// all 16 mask bits set means all 64 bytes were equal
  1420  	JEQ	hugeloop
  1421  	MOVB	$0, (AX)
  1422  	RET
  1423  
  1424  	// 4 bytes at a time using 32-bit register
  1425  bigloop:
  1426  	CMPL	BX, $4
  1427  	JBE	leftover
  1428  	MOVL	(SI), CX
  1429  	MOVL	(DI), DX
  1430  	ADDL	$4, SI
  1431  	ADDL	$4, DI
  1432  	SUBL	$4, BX
  1433  	CMPL	CX, DX
  1434  	JEQ	bigloop
  1435  	MOVB	$0, (AX)
  1436  	RET
  1437  
  1438  	// remaining 0-4 bytes
        	// Compares the 4 bytes ending at SI+BX / DI+BX. When BX < 4 this
        	// re-reads bytes that were already found equal, which is harmless.
  1439  leftover:
  1440  	MOVL	-4(SI)(BX*1), CX
  1441  	MOVL	-4(DI)(BX*1), DX
  1442  	CMPL	CX, DX
  1443  	SETEQ	(AX)
  1444  	RET
  1445  
        	// Fewer than 4 bytes in total.
  1446  small:
  1447  	CMPL	BX, $0
  1448  	JEQ	equal	// zero length: ZF is set, so SETEQ at equal writes 1
  1449  
        	// CX = -8*count; used as a shift count below.
        	// NOTE(review): relies on 32-bit shifts using only the low 5 bits
        	// of CL, making the effective count 32-8*count - confirm against the ISA manual.
  1450  	LEAL	0(BX*8), CX
  1451  	NEGL	CX
  1452  
  1453  	MOVL	SI, DX
  1454  	CMPB	DX, $0xfc	// low address byte above 0xfc?
  1455  	JA	si_high
  1456  
  1457  	// load at SI won't cross a page boundary.
  1458  	MOVL	(SI), SI
  1459  	JMP	si_finish
  1460  si_high:
  1461  	// address ends in 111111xx. Load up to bytes we want, move to correct position.
  1462  	MOVL	-4(SI)(BX*1), SI
  1463  	SHRL	CX, SI
  1464  si_finish:
  1465  
  1466  	// same for DI.
  1467  	MOVL	DI, DX
  1468  	CMPB	DX, $0xfc
  1469  	JA	di_high
  1470  	MOVL	(DI), DI
  1471  	JMP	di_finish
  1472  di_high:
  1473  	MOVL	-4(DI)(BX*1), DI
  1474  	SHRL	CX, DI
  1475  di_finish:
  1476  
        	// The difference is shifted left so that bytes beyond the count
        	// fall off the top; the result is zero iff the wanted bytes matched.
  1477  	SUBL	SI, DI
  1478  	SHLL	CX, DI
  1479  equal:
  1480  	SETEQ	(AX)	// uses ZF from SHLL above, or from CMPL BX, $0 on the zero-length path
  1481  	RET
  1482  
  1483  TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
        	// Three-way comparison of two strings. Loads both (base, len) pairs
        	// and the address of the result word into the registers cmpbody
        	// expects, then tail-jumps to it.
  1484  	MOVL	s1_base+0(FP), SI
  1485  	MOVL	s1_len+4(FP), BX
  1486  	MOVL	s2_base+8(FP), DI
  1487  	MOVL	s2_len+12(FP), DX
  1488  	LEAL	ret+16(FP), AX	// AX = address of the result word
  1489  	JMP	runtime·cmpbody(SB)
  1490  
  1491  TEXT bytes·Compare(SB),NOSPLIT,$0-28
        	// Three-way comparison of two byte slices. Each slice header occupies
        	// 12 bytes of the argument frame; the words at offsets 0 and 4 of each
        	// header are passed to cmpbody as pointer and length.
  1492  	MOVL	s1+0(FP), SI	// first slice: data pointer
  1493  	MOVL	s1+4(FP), BX	// first slice: length
  1494  	MOVL	s2+12(FP), DI	// second slice: data pointer
  1495  	MOVL	s2+16(FP), DX	// second slice: length
  1496  	LEAL	ret+24(FP), AX	// AX = address of the result word
  1497  	JMP	runtime·cmpbody(SB)
  1498  
  1499  TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
        	// Scans s_len bytes starting at s for the byte c and stores the index
        	// of the first match in ret, or -1 when there is no match.
        	// For s_len == 0 both exits yield -1, since DI == SI on the match path.
  1500  	MOVL	s+0(FP), SI
  1501  	MOVL	s_len+4(FP), CX	// CX = repeat count for the scan
  1502  	MOVB	c+12(FP), AL	// AL = byte to search for
  1503  	MOVL	SI, DI	// DI = scan pointer; SI keeps the start address
  1504  	CLD; REPN; SCASB
  1505  	JZ 3(PC)	// match found: skip the "not found" return
  1506  	MOVL	$-1, ret+16(FP)
  1507  	RET
  1508  	SUBL	SI, DI	// DI is one past the matching byte, so
  1509  	SUBL	$1, DI	// index = DI - SI - 1
  1510  	MOVL	DI, ret+16(FP)
  1511  	RET
  1512  
  1513  TEXT strings·IndexByte(SB),NOSPLIT,$0-16
        	// String counterpart of bytes·IndexByte: same scan, but the string
        	// header is 8 bytes, so c sits at offset 8 and ret at offset 12.
        	// Stores the index of the first byte equal to c, or -1 if none.
  1514  	MOVL	s+0(FP), SI
  1515  	MOVL	s_len+4(FP), CX	// CX = repeat count for the scan
  1516  	MOVB	c+8(FP), AL	// AL = byte to search for
  1517  	MOVL	SI, DI	// DI = scan pointer; SI keeps the start address
  1518  	CLD; REPN; SCASB
  1519  	JZ 3(PC)	// match found: skip the "not found" return
  1520  	MOVL	$-1, ret+12(FP)
  1521  	RET
  1522  	SUBL	SI, DI	// DI is one past the matching byte, so
  1523  	SUBL	$1, DI	// index = DI - SI - 1
  1524  	MOVL	DI, ret+12(FP)
  1525  	RET
  1526  
  1527  // input:
  1528  //   SI = a
  1529  //   DI = b
  1530  //   BX = alen
  1531  //   DX = blen
  1532  //   AX = address of return word (set to 1/0/-1)
        // Shared three-way comparison body reached by JMP from cmpstring and
        // bytes·Compare. Compares the first min(alen, blen) bytes; the first
        // differing byte decides the result (unsigned byte order). If those
        // bytes are all equal, the lengths decide. Uses BP as a scratch
        // counter and clobbers BX, CX, DX, SI, DI, X0 and X1.
  1533  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1534  	MOVL	DX, BP
  1535  	SUBL	BX, DX // DX = blen-alen
  1536  	JLE	2(PC)
  1537  	MOVL	BX, BP // BP = min(alen, blen)
  1538  	CMPL	SI, DI
  1539  	JEQ	allsame	// same start address: only the lengths can differ
  1540  	CMPL	BP, $4
  1541  	JB	small
  1542  	CMPB	runtime·support_sse2(SB), $1
  1543  	JNE	mediumloop
        	// 16 bytes at a time using xmm registers
  1544  largeloop:
  1545  	CMPL	BP, $16
  1546  	JB	mediumloop
  1547  	MOVOU	(SI), X0
  1548  	MOVOU	(DI), X1
  1549  	PCMPEQB X0, X1
  1550  	PMOVMSKB X1, BX
  1551  	XORL	$0xffff, BX	// convert EQ to NE
  1552  	JNE	diff16	// branch if at least one byte is not equal
  1553  	ADDL	$16, SI
  1554  	ADDL	$16, DI
  1555  	SUBL	$16, BP
  1556  	JMP	largeloop
  1557  
  1558  diff16:
  1559  	BSFL	BX, BX	// index of first byte that differs
  1560  	XORL	DX, DX
  1561  	MOVB	(SI)(BX*1), CX
  1562  	CMPB	CX, (DI)(BX*1)
  1563  	SETHI	DX	// DX = 1 if a's byte is above b's (unsigned), else 0
  1564  	LEAL	-1(DX*2), DX	// convert 1/0 to +1/-1
  1565  	MOVL	DX, (AX)
  1566  	RET
  1567  
        	// 4 bytes at a time using 32-bit registers
  1568  mediumloop:
  1569  	CMPL	BP, $4
  1570  	JBE	_0through4
  1571  	MOVL	(SI), BX
  1572  	MOVL	(DI), CX
  1573  	CMPL	BX, CX
  1574  	JNE	diff4
  1575  	ADDL	$4, SI
  1576  	ADDL	$4, DI
  1577  	SUBL	$4, BP
  1578  	JMP	mediumloop
  1579  
        	// Compare the 4 bytes ending at SI+BP / DI+BP. When BP < 4 this
        	// overlaps bytes already found equal.
  1580  _0through4:
  1581  	MOVL	-4(SI)(BP*1), BX
  1582  	MOVL	-4(DI)(BP*1), CX
  1583  	CMPL	BX, CX
  1584  	JEQ	allsame
  1585  
        	// BX and CX hold differing 4-byte words from a and b. After the byte
        	// swap the first byte in memory is the most significant, so the
        	// highest differing bit belongs to the first differing byte.
  1586  diff4:
  1587  	BSWAPL	BX	// reverse order of bytes
  1588  	BSWAPL	CX
  1589  	XORL	BX, CX	// find bit differences
  1590  	BSRL	CX, CX	// index of highest bit difference
  1591  	SHRL	CX, BX	// move a's bit to bottom
  1592  	ANDL	$1, BX	// mask bit
  1593  	LEAL	-1(BX*2), BX // 1/0 => +1/-1
  1594  	MOVL	BX, (AX)
  1595  	RET
  1596  
  1597  	// 0-3 bytes in common
  1598  small:
  1599  	LEAL	(BP*8), CX
  1600  	NEGL	CX	// CX = -8*BP; ZF is set when BP == 0
  1601  	JEQ	allsame
  1602  
  1603  	// load si
        	// (if the low address byte is above 0xfc, load the 4 bytes that end
        	// at the last wanted byte instead and shift them into place)
  1604  	CMPB	SI, $0xfc
  1605  	JA	si_high
  1606  	MOVL	(SI), SI
  1607  	JMP	si_finish
  1608  si_high:
  1609  	MOVL	-4(SI)(BP*1), SI
  1610  	SHRL	CX, SI
  1611  si_finish:
  1612  	SHLL	CX, SI	// push the bytes beyond the common length off the top
  1613  
  1614  	// same for di
  1615  	CMPB	DI, $0xfc
  1616  	JA	di_high
  1617  	MOVL	(DI), DI
  1618  	JMP	di_finish
  1619  di_high:
  1620  	MOVL	-4(DI)(BP*1), DI
  1621  	SHRL	CX, DI
  1622  di_finish:
  1623  	SHLL	CX, DI
  1624  
  1625  	BSWAPL	SI	// reverse order of bytes
  1626  	BSWAPL	DI
  1627  	XORL	SI, DI	// find bit differences
  1628  	JEQ	allsame
  1629  	BSRL	DI, CX	// index of highest bit difference
  1630  	SHRL	CX, SI	// move a's bit to bottom
  1631  	ANDL	$1, SI	// mask bit
  1632  	LEAL	-1(SI*2), BX // 1/0 => +1/-1
  1633  	MOVL	BX, (AX)
  1634  	RET
  1635  
  1636  	// all the bytes in common are the same, so we just need
  1637  	// to compare the lengths.
        	// DX still holds blen-alen from the top of the function.
  1638  allsame:
  1639  	XORL	BX, BX
  1640  	XORL	CX, CX
  1641  	TESTL	DX, DX
  1642  	SETLT	BX	// 1 if alen > blen
  1643  	SETEQ	CX	// 1 if alen == blen
  1644  	LEAL	-1(CX)(BX*2), BX	// 1,0,-1 result
  1645  	MOVL	BX, (AX)
  1646  	RET
  1647  
  1648  TEXT runtime·return0(SB), NOSPLIT, $0
        	// Sets AX to 0 and returns; touches nothing else.
  1649  	MOVL	$0, AX
  1650  	RET
  1651  
  1652  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
  1653  // Must obey the gcc calling convention.
        // The result is returned in AX; only AX and CX are modified.
  1654  TEXT _cgo_topofstack(SB),NOSPLIT,$0
  1655  	get_tls(CX)
  1656  	MOVL	g(CX), AX	// AX = g
  1657  	MOVL	g_m(AX), AX	// AX = g->m
  1658  	MOVL	m_curg(AX), AX	// AX = g->m->curg
  1659  	MOVL	(g_stack+stack_hi)(AX), AX	// AX = curg.stack.hi
  1660  	RET
  1661  
  1662  // The top-most function running on a goroutine
  1663  // returns to goexit+PCQuantum.
        // The body is a one-byte NOP, a CALL to goexit1, and a trailing NOP.
        // There is no RET: goexit1 is not expected to return.
  1664  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  1665  	BYTE	$0x90	// NOP
  1666  	CALL	runtime·goexit1(SB)	// does not return
  1667  	// traceback from goexit1 must hit code range of goexit
  1668  	BYTE	$0x90	// NOP
  1669  
  1670  // Add a module's moduledata to the linked list of moduledata objects. This
  1671  // is called from .init_array by a function generated in the linker and so
  1672  // follows the platform ABI wrt register preservation -- it only touches AX,
  1673  // CX (implicitly) and DX, but it does not follow the ABI wrt arguments:
  1674  // instead the pointer to the moduledata is passed in AX.
        // The new entry is appended: the current tail's next field is pointed
        // at it, and runtime·lastmoduledatap is then updated to the new entry.
  1675  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
  1676         MOVL    runtime·lastmoduledatap(SB), DX
  1677         MOVL    AX, moduledata_next(DX)
  1678         MOVL    AX, runtime·lastmoduledatap(SB)
  1679         RET
  1680  
  1681  TEXT runtime·uint32tofloat64(SB),NOSPLIT,$8-12
        	// Converts the 32-bit argument a to a float64 result at ret+4(FP).
        	// The value is widened to 64 bits on the stack with a zero high word,
        	// so it is treated as non-negative, then pushed through the x87 unit.
  1682  	MOVL	a+0(FP), AX
  1683  	MOVL	AX, 0(SP)	// low 32 bits = a
  1684  	MOVL	$0, 4(SP)	// high 32 bits = 0
  1685  	FMOVV	0(SP), F0	// load the 8-byte stack value onto the x87 stack
  1686  	FMOVDP	F0, ret+4(FP)	// store as float64 and pop
  1687  	RET
  1688  
  1689  TEXT runtime·float64touint32(SB),NOSPLIT,$12-12
        	// Converts the float64 argument a to a 32-bit result at ret+8(FP).
        	// The x87 control word is saved at 0(SP), replaced by
        	// runtime·controlWord64trunc for the conversion, and restored afterwards.
        	// The value is stored as an 8-byte integer at 4(SP); only its low
        	// 32 bits are returned.
  1690  	FMOVD	a+0(FP), F0
  1691  	FSTCW	0(SP)	// save the current control word
  1692  	FLDCW	runtime·controlWord64trunc(SB)
  1693  	FMOVVP	F0, 4(SP)	// store as 8-byte integer and pop
  1694  	FLDCW	0(SP)	// restore the saved control word
  1695  	MOVL	4(SP), AX	// low 32 bits of the converted value
  1696  	MOVL	AX, ret+8(FP)
  1697  	RET