github.com/4ad/go@v0.0.0-20161219182952-69a12818b605/src/runtime/asm_386.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
        	// Bootstrap sequence: stash argc/argv, give g0 provisional stack
        	// bounds, probe the CPU with CPUID, set up TLS (through _cgo_init
        	// when it is non-zero, otherwise runtime·ldt0setup), link g0 and m0,
        	// run check/args/osinit/schedinit, queue runtime·mainPC with newproc
        	// and enter mstart.
    11  	// copy arguments forward on an even stack
    12  	MOVL	argc+0(FP), AX
    13  	MOVL	argv+4(FP), BX
    14  	SUBL	$128, SP		// plenty of scratch
    15  	ANDL	$~15, SP		// round SP down to a 16-byte boundary
    16  	MOVL	AX, 120(SP)		// save argc, argv away
    17  	MOVL	BX, 124(SP)
    18  
    19  	// set default stack bounds.
    20  	// _cgo_init may update stackguard.
    21  	MOVL	$runtime·g0(SB), BP
    22  	LEAL	(-64*1024+104)(SP), BX	// provisional low bound: SP - 64KB + 104
    23  	MOVL	BX, g_stackguard0(BP)
    24  	MOVL	BX, g_stackguard1(BP)
    25  	MOVL	BX, (g_stack+stack_lo)(BP)
    26  	MOVL	SP, (g_stack+stack_hi)(BP)
    27  	
    28  	// find out information about the processor we're on
    29  #ifdef GOOS_nacl // NaCl doesn't like PUSHFL/POPFL
    30  	JMP 	has_cpuid
    31  #else
    32  	// first see if CPUID instruction is supported.
    33  	PUSHFL			// original EFLAGS, kept for the compare and the restore
    34  	PUSHFL			// second copy, modified below
    35  	XORL	$(1<<21), 0(SP) // flip ID bit
    36  	POPFL			// try to load EFLAGS with the ID bit flipped
    37  	PUSHFL
    38  	POPL	AX		// AX = EFLAGS as the CPU accepted them
    39  	XORL	0(SP), AX	// AX = bits that differ from the original EFLAGS
    40  	POPFL	// restore EFLAGS
    41  	TESTL	$(1<<21), AX
    42  	JNE 	has_cpuid	// ID bit could be toggled, so CPUID exists
    43  #endif
    44  
        	// Reached by fall-through when the ID bit could not be toggled, and by
        	// the MMX test further down.
    45  bad_proc: // show that the program requires MMX.
    46  	MOVL	$2, 0(SP)	// first argument to write (presumably fd 2 = stderr)
    47  	MOVL	$bad_proc_msg<>(SB), 4(SP)
    48  	MOVL	$0x3d, 8(SP)	// byte length of bad_proc_msg
    49  	CALL	runtime·write(SB)
    50  	MOVL	$1, 0(SP)	// argument 1 for exit
    51  	CALL	runtime·exit(SB)
    52  	INT	$3
    53  
    54  has_cpuid:
    55  	MOVL	$0, AX
    56  	CPUID			// leaf 0: AX = highest leaf, BX/DX/CX = vendor string
    57  	MOVL	AX, SI		// SI = highest supported leaf, used before the leaf 7 query
    58  	CMPL	AX, $0
    59  	JE	nocpuinfo
    60  
    61  	// Figure out how to serialize RDTSC.
    62  	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    63  	// Don't know about the rest, so let's do MFENCE.
    64  	CMPL	BX, $0x756E6547  // "Genu"
    65  	JNE	notintel
    66  	CMPL	DX, $0x49656E69  // "ineI"
    67  	JNE	notintel
    68  	CMPL	CX, $0x6C65746E  // "ntel"
    69  	JNE	notintel
    70  	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)	// consulted by runtime·cputicks
    71  notintel:
    72  
    73  	// Load EAX=1 cpuid flags
    74  	MOVL	$1, AX
    75  	CPUID
    76  	MOVL	CX, AX // Move to global variable clobbers CX when generating PIC
    77  	MOVL	AX, runtime·cpuid_ecx(SB)
    78  	MOVL	DX, runtime·cpuid_edx(SB)
    79  
    80  	// Check for MMX support
    81  	TESTL	$(1<<23), DX	// MMX
    82  	JZ 	bad_proc
    83  
    84  	// Load EAX=7/ECX=0 cpuid flags
    85  	CMPL	SI, $7
    86  	JLT	nocpuinfo	// leaf 7 not supported
    87  	MOVL	$7, AX
    88  	MOVL	$0, CX
    89  	CPUID
    90  	MOVL	BX, runtime·cpuid_ebx7(SB)
    91  
    92  nocpuinfo:	
    93  
    94  	// if there is an _cgo_init, call it to let it
    95  	// initialize and to set up GS.  if not,
    96  	// we set up GS ourselves.
    97  	MOVL	_cgo_init(SB), AX
    98  	TESTL	AX, AX
    99  	JZ	needtls
   100  	MOVL	$setg_gcc<>(SB), BX
   101  	MOVL	BX, 4(SP)	// second argument: address of setg_gcc
   102  	MOVL	BP, 0(SP)	// first argument: BP, loaded with &g0 above
   103  	CALL	AX
   104  
   105  	// update stackguard after _cgo_init
   106  	MOVL	$runtime·g0(SB), CX
   107  	MOVL	(g_stack+stack_lo)(CX), AX
   108  	ADDL	$const__StackGuard, AX
   109  	MOVL	AX, g_stackguard0(CX)
   110  	MOVL	AX, g_stackguard1(CX)
   111  
   112  #ifndef GOOS_windows
   113  	// skip runtime·ldt0setup(SB) and tls test after _cgo_init for non-windows
   114  	JMP ok
   115  #endif
   116  needtls:
   117  #ifdef GOOS_plan9
   118  	// skip runtime·ldt0setup(SB) and tls test on Plan 9 in all cases
   119  	JMP	ok
   120  #endif
   121  
   122  	// set up %gs
   123  	CALL	runtime·ldt0setup(SB)
   124  
   125  	// store through it, to make sure it works
   126  	get_tls(BX)
   127  	MOVL	$0x123, g(BX)
   128  	MOVL	runtime·m0+m_tls(SB), AX	// read the value back directly from m0.tls
   129  	CMPL	AX, $0x123
   130  	JEQ	ok
   131  	MOVL	AX, 0	// abort
   132  ok:
   133  	// set up m and g "registers"
   134  	get_tls(BX)
   135  	LEAL	runtime·g0(SB), DX
   136  	MOVL	DX, g(BX)
   137  	LEAL	runtime·m0(SB), AX
   138  
   139  	// save m->g0 = g0
   140  	MOVL	DX, m_g0(AX)
   141  	// save g0->m = m0
   142  	MOVL	AX, g_m(DX)
   143  
   144  	CALL	runtime·emptyfunc(SB)	// fault if stack check is wrong
   145  
   146  	// convention is D is always cleared
   147  	CLD
   148  
   149  	CALL	runtime·check(SB)
   150  
   151  	// saved argc, argv
   152  	MOVL	120(SP), AX
   153  	MOVL	AX, 0(SP)
   154  	MOVL	124(SP), AX
   155  	MOVL	AX, 4(SP)
   156  	CALL	runtime·args(SB)
   157  	CALL	runtime·osinit(SB)
   158  	CALL	runtime·schedinit(SB)
   159  
   160  	// create a new goroutine to start program
   161  	PUSHL	$runtime·mainPC(SB)	// entry
   162  	PUSHL	$0	// arg size
   163  	CALL	runtime·newproc(SB)
   164  	POPL	AX	// discard the two pushed arguments
   165  	POPL	AX
   166  
   167  	// start this M
   168  	CALL	runtime·mstart(SB)
   169  
   170  	INT $3	// trap if mstart ever returns
   171  	RET
   172  
   173  DATA	bad_proc_msg<>+0x00(SB)/8, $"This pro"	// message written by the bad_proc path of rt0_go
   174  DATA	bad_proc_msg<>+0x08(SB)/8, $"gram can"
   175  DATA	bad_proc_msg<>+0x10(SB)/8, $" only be"
   176  DATA	bad_proc_msg<>+0x18(SB)/8, $" run on "
   177  DATA	bad_proc_msg<>+0x20(SB)/8, $"processo"
   178  DATA	bad_proc_msg<>+0x28(SB)/8, $"rs with "
   179  DATA	bad_proc_msg<>+0x30(SB)/8, $"MMX supp"
   180  DATA	bad_proc_msg<>+0x38(SB)/4, $"ort."
   181  DATA	bad_proc_msg<>+0x3c(SB)/1, $0xa	// trailing newline
   182  GLOBL	bad_proc_msg<>(SB), RODATA, $0x3d	// 7*8 + 4 + 1 = 61 (0x3d) bytes, read-only
   183  
   184  DATA	runtime·mainPC+0(SB)/4,$runtime·main(SB)	// one word holding the address of runtime·main
   185  GLOBL	runtime·mainPC(SB),RODATA,$4	// rt0_go pushes $runtime·mainPC as newproc's entry argument
   186  
   187  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
        	// Raise a breakpoint trap (INT 3); return to the caller if execution resumes.
   188  	INT $3
   189  	RET
   190  
   191  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   192  	// Linux and MinGW start the FPU in extended double precision.
   193  	// Other operating systems use double precision.
   194  	// Change to double precision to match them,
   195  	// and to match other hardware that only has double.
   196  	PUSHL $0x27F	// new x87 control word, placed on the stack so FLDCW can read it
   197  	FLDCW	0(SP)	// load it into the FPU control register
   198  	POPL AX		// drop the temporary word; AX is clobbered
   199  	RET
   200  
   201  /*
   202   *  go-routine
   203   */
   204  
   205  // void gosave(Gobuf*)
   206  // save state in Gobuf; setjmp
        // Records the caller's SP and return PC and the current g in *buf,
        // and zeroes buf's ret and ctxt fields. Clobbers AX, BX, CX.
   207  TEXT runtime·gosave(SB), NOSPLIT, $0-4
   208  	MOVL	buf+0(FP), AX		// gobuf
   209  	LEAL	buf+0(FP), BX		// caller's SP
   210  	MOVL	BX, gobuf_sp(AX)
   211  	MOVL	0(SP), BX		// caller's PC
   212  	MOVL	BX, gobuf_pc(AX)
   213  	MOVL	$0, gobuf_ret(AX)
   214  	MOVL	$0, gobuf_ctxt(AX)
   215  	get_tls(CX)
   216  	MOVL	g(CX), BX		// current g from TLS
   217  	MOVL	BX, gobuf_g(AX)
   218  	RET
   219  
   220  // void gogo(Gobuf*)
   221  // restore state from Gobuf; longjmp
        // Installs buf.g as the current g, switches SP to buf.sp, loads
        // AX = buf.ret and DX = buf.ctxt, clears those three fields in buf,
        // and jumps to buf.pc. Does not return to its caller.
   222  TEXT runtime·gogo(SB), NOSPLIT, $0-4
   223  	MOVL	buf+0(FP), BX		// gobuf
   224  	MOVL	gobuf_g(BX), DX
   225  	MOVL	0(DX), CX		// make sure g != nil
   226  	get_tls(CX)
   227  	MOVL	DX, g(CX)		// current g = buf.g
   228  	MOVL	gobuf_sp(BX), SP	// restore SP
   229  	MOVL	gobuf_ret(BX), AX
   230  	MOVL	gobuf_ctxt(BX), DX
   231  	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
   232  	MOVL	$0, gobuf_ret(BX)
   233  	MOVL	$0, gobuf_ctxt(BX)
   234  	MOVL	gobuf_pc(BX), BX	// BX is reused: gobuf pointer is no longer needed
   235  	JMP	BX
   236  
   237  // func mcall(fn func(*g))
   238  // Switch to m->g0's stack, call fn(g).
   239  // Fn must never return. It should gogo(&g->sched)
   240  // to keep running g.
        // The caller's PC, SP and g are saved in g->sched before the switch.
        // Jumps to badmcall if already running on m->g0, and to badmcall2 if
        // fn returns.
   241  TEXT runtime·mcall(SB), NOSPLIT, $0-4
   242  	MOVL	fn+0(FP), DI
   243  
   244  	get_tls(DX)
   245  	MOVL	g(DX), AX	// save state in g->sched
   246  	MOVL	0(SP), BX	// caller's PC
   247  	MOVL	BX, (g_sched+gobuf_pc)(AX)
   248  	LEAL	fn+0(FP), BX	// caller's SP
   249  	MOVL	BX, (g_sched+gobuf_sp)(AX)
   250  	MOVL	AX, (g_sched+gobuf_g)(AX)
   251  
   252  	// switch to m->g0 & its stack, call fn
   253  	MOVL	g(DX), BX
   254  	MOVL	g_m(BX), BX
   255  	MOVL	m_g0(BX), SI
   256  	CMPL	SI, AX	// if g == m->g0 call badmcall
   257  	JNE	3(PC)	// skip the next two instructions when g != m->g0
   258  	MOVL	$runtime·badmcall(SB), AX
   259  	JMP	AX
   260  	MOVL	SI, g(DX)	// g = m->g0
   261  	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   262  	PUSHL	AX		// argument for fn: the g we switched away from
   263  	MOVL	DI, DX		// DX = fn value (closure context register)
   264  	MOVL	0(DI), DI	// DI = code pointer stored in the first word of fn
   265  	CALL	DI
   266  	POPL	AX
   267  	MOVL	$runtime·badmcall2(SB), AX	// fn returned, which it must not
   268  	JMP	AX
   269  	RET
   270  
   271  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   272  // of the G stack. We need to distinguish the routine that
   273  // lives at the bottom of the G stack from the one that lives
   274  // at the top of the system stack because the one at the top of
   275  // the system stack terminates the stack walk (see topofstack()).
        // Its address is only stored as a PC (systemstack writes it into
        // g->sched.pc); the body is a bare RET.
   276  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   277  	RET
   278  
   279  // func systemstack(fn func())
        // Runs fn on the m->g0 stack. If the current g is already gsignal or g0,
        // fn is called directly. If it is m->curg, state is saved in g->sched,
        // execution moves to g0's saved SP, fn is called, and then g and SP are
        // switched back to m->curg. Any other g is reported via badsystemstack.
   280  TEXT runtime·systemstack(SB), NOSPLIT, $0-4
   281  	MOVL	fn+0(FP), DI	// DI = fn
   282  	get_tls(CX)
   283  	MOVL	g(CX), AX	// AX = g
   284  	MOVL	g_m(AX), BX	// BX = m
   285  
   286  	MOVL	m_gsignal(BX), DX	// DX = gsignal
   287  	CMPL	AX, DX
   288  	JEQ	noswitch
   289  
   290  	MOVL	m_g0(BX), DX	// DX = g0
   291  	CMPL	AX, DX
   292  	JEQ	noswitch
   293  
   294  	MOVL	m_curg(BX), BP
   295  	CMPL	AX, BP
   296  	JEQ	switch
   297  	
   298  	// Bad: g is not gsignal, not g0, not curg. What is it?
   299  	// Hide call from linker nosplit analysis.
   300  	MOVL	$runtime·badsystemstack(SB), AX
   301  	CALL	AX
   302  
   303  switch:
   304  	// save our state in g->sched. Pretend to
   305  	// be systemstack_switch if the G stack is scanned.
   306  	MOVL	$runtime·systemstack_switch(SB), (g_sched+gobuf_pc)(AX)
   307  	MOVL	SP, (g_sched+gobuf_sp)(AX)
   308  	MOVL	AX, (g_sched+gobuf_g)(AX)
   309  
   310  	// switch to g0
   311  	get_tls(CX)
   312  	MOVL	DX, g(CX)	// DX still holds g0 from the check above
   313  	MOVL	(g_sched+gobuf_sp)(DX), BX
   314  	// make it look like mstart called systemstack on g0, to stop traceback
   315  	SUBL	$4, BX
   316  	MOVL	$runtime·mstart(SB), DX
   317  	MOVL	DX, 0(BX)	// fake return PC slot
   318  	MOVL	BX, SP
   319  
   320  	// call target function
   321  	MOVL	DI, DX		// DX = fn value (closure context register)
   322  	MOVL	0(DI), DI	// DI = code pointer stored in the first word of fn
   323  	CALL	DI
   324  
   325  	// switch back to g
   326  	get_tls(CX)
   327  	MOVL	g(CX), AX
   328  	MOVL	g_m(AX), BX
   329  	MOVL	m_curg(BX), AX
   330  	MOVL	AX, g(CX)
   331  	MOVL	(g_sched+gobuf_sp)(AX), SP	// SP saved at the switch label
   332  	MOVL	$0, (g_sched+gobuf_sp)(AX)
   333  	RET
   334  
   335  noswitch:
   336  	// already on system stack, just call directly
   337  	MOVL	DI, DX
   338  	MOVL	0(DI), DI
   339  	CALL	DI
   340  	RET
   341  
   342  /*
   343   * support for morestack
   344   */
   345  
   346  // Called during function prolog when more stack is needed.
   347  //
   348  // The traceback routines see morestack on a g0 as being
   349  // the top of a stack (for example, morestack calling newstack
   350  // calling the scheduler calling newm calling gc), so we must
   351  // record an argument size. For that purpose, it has no arguments.
        //
        // On entry DX is stored as g->sched.ctxt (morestack_noctxt zeroes it
        // first). Saves f's caller in m->morebuf and f's own context in
        // g->sched, then calls newstack on m->g0's stack.
   352  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   353  	// Cannot grow scheduler stack (m->g0).
   354  	get_tls(CX)
   355  	MOVL	g(CX), BX
   356  	MOVL	g_m(BX), BX	// BX = m for the rest of the function
   357  	MOVL	m_g0(BX), SI
   358  	CMPL	g(CX), SI
   359  	JNE	2(PC)		// skip the trap when g != m->g0
   360  	INT	$3
   361  
   362  	// Cannot grow signal stack.
   363  	MOVL	m_gsignal(BX), SI
   364  	CMPL	g(CX), SI
   365  	JNE	2(PC)		// skip the trap when g != m->gsignal
   366  	INT	$3
   367  
   368  	// Called from f.
   369  	// Set m->morebuf to f's caller.
   370  	MOVL	4(SP), DI	// f's caller's PC
   371  	MOVL	DI, (m_morebuf+gobuf_pc)(BX)
   372  	LEAL	8(SP), CX	// f's caller's SP
   373  	MOVL	CX, (m_morebuf+gobuf_sp)(BX)
   374  	get_tls(CX)		// CX was used as scratch above; reload the TLS base
   375  	MOVL	g(CX), SI
   376  	MOVL	SI, (m_morebuf+gobuf_g)(BX)
   377  
   378  	// Set g->sched to context in f.
   379  	MOVL	0(SP), AX	// f's PC
   380  	MOVL	AX, (g_sched+gobuf_pc)(SI)
   381  	MOVL	SI, (g_sched+gobuf_g)(SI)
   382  	LEAL	4(SP), AX	// f's SP
   383  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   384  	MOVL	DX, (g_sched+gobuf_ctxt)(SI)
   385  
   386  	// Call newstack on m->g0's stack.
   387  	MOVL	m_g0(BX), BP
   388  	MOVL	BP, g(CX)
   389  	MOVL	(g_sched+gobuf_sp)(BP), AX
   390  	MOVL	-4(AX), BX	// fault if CALL would, before smashing SP
   391  	MOVL	AX, SP
   392  	CALL	runtime·newstack(SB)
   393  	MOVL	$0, 0x1003	// crash if newstack returns
   394  	RET
   395  
   396  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0-0
        	// Variant of morestack with no context: zero DX (which morestack
        	// stores as g->sched.ctxt) and tail-jump to morestack.
   397  	MOVL	$0, DX
   398  	JMP runtime·morestack(SB)
   399  
   400  TEXT runtime·stackBarrier(SB),NOSPLIT,$0
   401  	// We came here via a RET to an overwritten return PC.
   402  	// AX may be live. Other registers are available.
        	// Uses only BX, CX and DX, so AX is left untouched.
   403  
   404  	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
   405  	get_tls(CX)
   406  	MOVL	g(CX), CX			// CX = g
   407  	MOVL	(g_stkbar+slice_array)(CX), DX	// DX = base of the g.stkbar array
   408  	MOVL	g_stkbarPos(CX), BX		// BX = index of the barrier being hit
   409  	IMULL	$stkbar__size, BX	// Too big for SIB.
   410  	MOVL	stkbar_savedLRVal(DX)(BX*1), BX
   411  	// Record that this stack barrier was hit.
   412  	ADDL	$1, g_stkbarPos(CX)
   413  	// Jump to the original return PC.
   414  	JMP	BX
   415  
   416  // reflectcall: call a function with the given argument list
   417  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   418  // we don't have variable-sized frames, so we use a small number
   419  // of constant-sized-frame functions to encode a few bits of size in the pc.
   420  // Caution: ugly multiline assembly macros in your future!
        //
        // DISPATCH(NAME, MAXSIZE) expects the argument size in CX. If CX is at
        // most MAXSIZE (unsigned compare) it jumps to NAME through AX; otherwise
        // JA 3(PC) skips the MOVL and JMP so the next DISPATCH can be tried.
   421  
   422  #define DISPATCH(NAME,MAXSIZE)		\
   423  	CMPL	CX, $MAXSIZE;		\
   424  	JA	3(PC);			\
   425  	MOVL	$NAME(SB), AX;		\
   426  	JMP	AX
   427  // Note: can't just "JMP NAME(SB)" - bad inlining results.
   428  
   429  TEXT reflect·call(SB), NOSPLIT, $0-0
        	// Entry point under the reflect package's name; forwards to ·reflectcall
        	// with the stack unchanged, so the callee sees the same arguments.
   430  	JMP	·reflectcall(SB)
   431  
   432  TEXT ·reflectcall(SB), NOSPLIT, $0-20
        	// Selects the smallest callNNN trampoline whose frame can hold
        	// argsize bytes and jumps to it; sizes above 1073741824 go to
        	// badreflectcall.
   433  	MOVL	argsize+12(FP), CX	// CX = argument size tested by every DISPATCH
   434  	DISPATCH(runtime·call16, 16)
   435  	DISPATCH(runtime·call32, 32)
   436  	DISPATCH(runtime·call64, 64)
   437  	DISPATCH(runtime·call128, 128)
   438  	DISPATCH(runtime·call256, 256)
   439  	DISPATCH(runtime·call512, 512)
   440  	DISPATCH(runtime·call1024, 1024)
   441  	DISPATCH(runtime·call2048, 2048)
   442  	DISPATCH(runtime·call4096, 4096)
   443  	DISPATCH(runtime·call8192, 8192)
   444  	DISPATCH(runtime·call16384, 16384)
   445  	DISPATCH(runtime·call32768, 32768)
   446  	DISPATCH(runtime·call65536, 65536)
   447  	DISPATCH(runtime·call131072, 131072)
   448  	DISPATCH(runtime·call262144, 262144)
   449  	DISPATCH(runtime·call524288, 524288)
   450  	DISPATCH(runtime·call1048576, 1048576)
   451  	DISPATCH(runtime·call2097152, 2097152)
   452  	DISPATCH(runtime·call4194304, 4194304)
   453  	DISPATCH(runtime·call8388608, 8388608)
   454  	DISPATCH(runtime·call16777216, 16777216)
   455  	DISPATCH(runtime·call33554432, 33554432)
   456  	DISPATCH(runtime·call67108864, 67108864)
   457  	DISPATCH(runtime·call134217728, 134217728)
   458  	DISPATCH(runtime·call268435456, 268435456)
   459  	DISPATCH(runtime·call536870912, 536870912)
   460  	DISPATCH(runtime·call1073741824, 1073741824)
   461  	MOVL	$runtime·badreflectcall(SB), AX	// no trampoline is large enough
   462  	JMP	AX
   463  
   464  #define CALLFN(NAME,MAXSIZE)			\
   465  TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
   466  	NO_LOCAL_POINTERS;			\
   467  	/* copy argsize bytes from argptr to SP */	\
   468  	MOVL	argptr+8(FP), SI;		\
   469  	MOVL	argsize+12(FP), CX;		\
   470  	MOVL	SP, DI;				\
   471  	REP;MOVSB;				\
   472  	/* call f: DX = f, AX = first word of f */	\
   473  	MOVL	f+4(FP), DX;			\
   474  	MOVL	(DX), AX; 			\
   475  	PCDATA  $PCDATA_StackMapIndex, $0;	\
   476  	CALL	AX;				\
   477  	/* copy bytes from retoffset on back to argptr */	\
   478  	MOVL	argptr+8(FP), DI;		\
   479  	MOVL	argsize+12(FP), CX;		\
   480  	MOVL	retoffset+16(FP), BX;		\
   481  	MOVL	SP, SI;				\
   482  	ADDL	BX, DI;				\
   483  	ADDL	BX, SI;				\
   484  	SUBL	BX, CX;				\
   485  	REP;MOVSB;				\
   486  	/* execute write barrier updates */	\
   487  	MOVL	argtype+0(FP), DX;		\
   488  	MOVL	argptr+8(FP), DI;		\
   489  	MOVL	argsize+12(FP), CX;		\
   490  	MOVL	retoffset+16(FP), BX;		\
   491  	MOVL	DX, 0(SP);			\
   492  	MOVL	DI, 4(SP);			\
   493  	MOVL	CX, 8(SP);			\
   494  	MOVL	BX, 12(SP);			\
   495  	CALL	runtime·callwritebarrier(SB);	\
   496  	RET
   497  
   498  CALLFN(·call16, 16)	// one trampoline per size in reflectcall's DISPATCH list; frame size equals MAXSIZE
   499  CALLFN(·call32, 32)
   500  CALLFN(·call64, 64)
   501  CALLFN(·call128, 128)
   502  CALLFN(·call256, 256)
   503  CALLFN(·call512, 512)
   504  CALLFN(·call1024, 1024)
   505  CALLFN(·call2048, 2048)
   506  CALLFN(·call4096, 4096)
   507  CALLFN(·call8192, 8192)
   508  CALLFN(·call16384, 16384)
   509  CALLFN(·call32768, 32768)
   510  CALLFN(·call65536, 65536)
   511  CALLFN(·call131072, 131072)
   512  CALLFN(·call262144, 262144)
   513  CALLFN(·call524288, 524288)
   514  CALLFN(·call1048576, 1048576)
   515  CALLFN(·call2097152, 2097152)
   516  CALLFN(·call4194304, 4194304)
   517  CALLFN(·call8388608, 8388608)
   518  CALLFN(·call16777216, 16777216)
   519  CALLFN(·call33554432, 33554432)
   520  CALLFN(·call67108864, 67108864)
   521  CALLFN(·call134217728, 134217728)
   522  CALLFN(·call268435456, 268435456)
   523  CALLFN(·call536870912, 536870912)
   524  CALLFN(·call1073741824, 1073741824)
   525  
   526  TEXT runtime·procyield(SB),NOSPLIT,$0-0
   527  	MOVL	cycles+0(FP), AX
   528  again:
   529  	PAUSE
   530  	SUBL	$1, AX
   531  	JNZ	again
   532  	RET
   533  
   534  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   535  	// Stores are already ordered on x86, so this is just a
   536  	// compile barrier.
        	// No fence instruction is issued; the body is a bare RET.
   537  	RET
   538  
   539  // void jmpdefer(fn, sp);
   540  // called from deferreturn.
   541  // 1. pop the caller
   542  // 2. sub 5 bytes (the length of CALL & a 32 bit displacement) from the callers
   543  //    return (when building for shared libraries, subtract 16 bytes -- 5 bytes
   544  //    for CALL & displacement to call __x86.get_pc_thunk.cx, 6 bytes for the
   545  //    LEAL to load the offset into BX, and finally 5 for the call & displacement)
   546  // 3. jmp to the argument
        // The deferred function therefore returns to the rewound return address,
        // i.e. to the CALL instruction itself. DX holds fn across the jump.
   547  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
   548  	MOVL	fv+0(FP), DX	// fn
   549  	MOVL	argp+4(FP), BX	// caller sp
   550  	LEAL	-4(BX), SP	// caller sp after CALL
   551  #ifdef GOBUILDMODE_shared
   552  	SUBL	$16, (SP)	// return to CALL again
   553  #else
   554  	SUBL	$5, (SP)	// return to CALL again
   555  #endif
   556  	MOVL	0(DX), BX	// BX = code pointer stored in the first word of fn
   557  	JMP	BX	// but first run the deferred function
   558  
   559  // Save state of caller into g->sched.
        // Stores the caller's SP and return PC in g->sched and zeroes
        // sched.ret and sched.ctxt. AX and BX are saved and restored around
        // the body, so no general register is changed on return.
   560  TEXT gosave<>(SB),NOSPLIT,$0
   561  	PUSHL	AX
   562  	PUSHL	BX
   563  	get_tls(BX)
   564  	MOVL	g(BX), BX		// BX = current g
   565  	LEAL	arg+0(FP), AX		// AX = address of the argument area = caller's SP after return
   566  	MOVL	AX, (g_sched+gobuf_sp)(BX)
   567  	MOVL	-4(AX), AX		// AX = return PC stored just below the arguments
   568  	MOVL	AX, (g_sched+gobuf_pc)(BX)
   569  	MOVL	$0, (g_sched+gobuf_ret)(BX)
   570  	MOVL	$0, (g_sched+gobuf_ctxt)(BX)
   571  	POPL	BX
   572  	POPL	AX
   573  	RET
   574  
   575  // func asmcgocall(fn, arg unsafe.Pointer) int32
   576  // Call fn(arg) on the scheduler stack,
   577  // aligned appropriately for the gcc ABI.
   578  // See cgocall.go for more details.
        // The value fn leaves in AX is stored as the int32 result.
   579  TEXT ·asmcgocall(SB),NOSPLIT,$0-12
   580  	MOVL	fn+0(FP), AX
   581  	MOVL	arg+4(FP), BX
   582  
   583  	MOVL	SP, DX		// DX = SP on entry, used below to compute the stack depth
   584  
   585  	// Figure out if we need to switch to m->g0 stack.
   586  	// We get called to create new OS threads too, and those
   587  	// come in on the m->g0 stack already.
   588  	get_tls(CX)
   589  	MOVL	g(CX), BP
   590  	MOVL	g_m(BP), BP
   591  	MOVL	m_g0(BP), SI	// SI = m->g0
   592  	MOVL	g(CX), DI	// DI = current g
   593  	CMPL	SI, DI
   594  	JEQ	noswitch
   595  	CALL	gosave<>(SB)	// record the caller's SP/PC in g->sched
   596  	get_tls(CX)
   597  	MOVL	SI, g(CX)
   598  	MOVL	(g_sched+gobuf_sp)(SI), SP
   599  
   600  noswitch:
   601  	// Now on a scheduling stack (a pthread-created stack).
   602  	SUBL	$32, SP
   603  	ANDL	$~15, SP	// alignment, perhaps unnecessary
   604  	MOVL	DI, 8(SP)	// save g
   605  	MOVL	(g_stack+stack_hi)(DI), DI
   606  	SUBL	DX, DI
   607  	MOVL	DI, 4(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
   608  	MOVL	BX, 0(SP)	// first argument in x86-32 ABI
   609  	CALL	AX
   610  
   611  	// Restore registers, g, stack pointer.
   612  	get_tls(CX)
   613  	MOVL	8(SP), DI	// DI = saved g
   614  	MOVL	(g_stack+stack_hi)(DI), SI
   615  	SUBL	4(SP), SI	// SI = stack.hi - saved depth = original SP
   616  	MOVL	DI, g(CX)
   617  	MOVL	SI, SP
   618  
   619  	MOVL	AX, ret+8(FP)
   620  	RET
   621  
   622  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
   623  // Wraps fn as a Go func value by passing the address of the fn argument
   624  // slot, then forwards all four arguments to cgocallback_gofunc.
   625  TEXT runtime·cgocallback(SB),NOSPLIT,$16-16
        	// Outgoing argument slots are filled last-to-first; each copy goes
        	// through AX and the copies are independent of one another.
   626  	MOVL	ctxt+12(FP), AX
   627  	MOVL	AX, 12(SP)		// arg 3: ctxt
   628  	MOVL	framesize+8(FP), AX
   629  	MOVL	AX, 8(SP)		// arg 2: framesize
   630  	MOVL	frame+4(FP), AX
   631  	MOVL	AX, 4(SP)		// arg 1: frame
   632  	LEAL	fn+0(FP), AX
   633  	MOVL	AX, 0(SP)		// arg 0: &fn, serving as the FuncVal pointer
   634  	MOVL	$runtime·cgocallback_gofunc(SB), AX
   635  	CALL	AX
   636  	RET
   637  
   638  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
   639  // See cgocall.go for more details.
        // Outline: obtain an m (calling needm when g is nil), save
        // m->g0->sched.sp, switch to m->curg's stack, call cgocallbackg,
        // switch back to m->g0, and call dropm if an m was borrowed.
   640  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-16
   641  	NO_LOCAL_POINTERS
   642  
   643  	// If g is nil, Go did not create the current thread.
   644  	// Call needm to obtain one for temporary use.
   645  	// In this case, we're running on the thread stack, so there's
   646  	// lots of space, but the linker doesn't know. Hide the call from
   647  	// the linker analysis by using an indirect call through AX.
   648  	get_tls(CX)
   649  #ifdef GOOS_windows
   650  	MOVL	$0, BP
   651  	CMPL	CX, $0
   652  	JEQ	2(PC) // TODO
   653  #endif
   654  	MOVL	g(CX), BP
   655  	CMPL	BP, $0
   656  	JEQ	needm
   657  	MOVL	g_m(BP), BP
   658  	MOVL	BP, DX // saved copy of oldm
   659  	JMP	havem
   660  needm:
   661  	MOVL	$0, 0(SP)
   662  	MOVL	$runtime·needm(SB), AX
   663  	CALL	AX
   664  	MOVL	0(SP), DX	// DX (oldm) = word at 0(SP), which was zeroed before the call
   665  	get_tls(CX)
   666  	MOVL	g(CX), BP
   667  	MOVL	g_m(BP), BP
   668  
   669  	// Set m->sched.sp = SP, so that if a panic happens
   670  	// during the function we are about to execute, it will
   671  	// have a valid SP to run on the g0 stack.
   672  	// The next few lines (after the havem label)
   673  	// will save this SP onto the stack and then write
   674  	// the same SP back to m->sched.sp. That seems redundant,
   675  	// but if an unrecovered panic happens, unwindm will
   676  	// restore the g->sched.sp from the stack location
   677  	// and then systemstack will try to use it. If we don't set it here,
   678  	// that restored SP will be uninitialized (typically 0) and
   679  	// will not be usable.
   680  	MOVL	m_g0(BP), SI
   681  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   682  
   683  havem:
   684  	// Now there's a valid m, and we're running on its m->g0.
   685  	// Save current m->g0->sched.sp on stack and then set it to SP.
   686  	// Save current sp in m->g0->sched.sp in preparation for
   687  	// switch back to m->curg stack.
   688  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   689  	MOVL	m_g0(BP), SI
   690  	MOVL	(g_sched+gobuf_sp)(SI), AX
   691  	MOVL	AX, 0(SP)
   692  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   693  
   694  	// Switch to m->curg stack and call runtime.cgocallbackg.
   695  	// Because we are taking over the execution of m->curg
   696  	// but *not* resuming what had been running, we need to
   697  	// save that information (m->curg->sched) so we can restore it.
   698  	// We can restore m->curg->sched.sp easily, because calling
   699  	// runtime.cgocallbackg leaves SP unchanged upon return.
   700  	// To save m->curg->sched.pc, we push it onto the stack.
   701  	// This has the added benefit that it looks to the traceback
   702  	// routine like cgocallbackg is going to return to that
   703  	// PC (because the frame we allocate below has the same
   704  	// size as cgocallback_gofunc's frame declared above)
   705  	// so that the traceback will seamlessly trace back into
   706  	// the earlier calls.
   707  	//
   708  	// In the new goroutine, 4(SP) holds the saved oldm (DX) register.
   709  	// 8(SP) is unused.
   710  	MOVL	m_curg(BP), SI
   711  	MOVL	SI, g(CX)
   712  	MOVL	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
   713  	MOVL	(g_sched+gobuf_pc)(SI), BP
   714  	MOVL	BP, -4(DI)		// saved sched.pc sits where a return PC would
   715  	MOVL	ctxt+12(FP), CX		// read ctxt before SP is switched
   716  	LEAL	-(4+12)(DI), SP		// 4 bytes for the saved PC plus a 12-byte frame
   717  	MOVL	DX, 4(SP)
   718  	MOVL	CX, 0(SP)		// argument for cgocallbackg: ctxt
   719  	CALL	runtime·cgocallbackg(SB)
   720  	MOVL	4(SP), DX		// reload oldm saved before the call
   721  
   722  	// Restore g->sched (== m->curg->sched) from saved values.
   723  	get_tls(CX)
   724  	MOVL	g(CX), SI
   725  	MOVL	12(SP), BP		// the sched.pc stored at -4(DI) above
   726  	MOVL	BP, (g_sched+gobuf_pc)(SI)
   727  	LEAL	(12+4)(SP), DI
   728  	MOVL	DI, (g_sched+gobuf_sp)(SI)
   729  
   730  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   731  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   732  	// so we do not have to restore it.)
   733  	MOVL	g(CX), BP
   734  	MOVL	g_m(BP), BP
   735  	MOVL	m_g0(BP), SI
   736  	MOVL	SI, g(CX)
   737  	MOVL	(g_sched+gobuf_sp)(SI), SP
   738  	MOVL	0(SP), AX		// old m->g0->sched.sp saved at havem
   739  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   740  	
   741  	// If the m on entry was nil, we called needm above to borrow an m
   742  	// for the duration of the call. Since the call is over, return it with dropm.
   743  	CMPL	DX, $0
   744  	JNE 3(PC)
   745  	MOVL	$runtime·dropm(SB), AX
   746  	CALL	AX
   747  
   748  	// Done!
   749  	RET
   750  
   751  // void setg(G*); set g. for use by needm.
        // On Windows the word at 0x14(FS) is also updated: cleared when gg is nil
        // (returning early without touching g), otherwise set to &gg.m.tls.
   752  TEXT runtime·setg(SB), NOSPLIT, $0-4
   753  	MOVL	gg+0(FP), BX
   754  #ifdef GOOS_windows
   755  	CMPL	BX, $0
   756  	JNE	settls
   757  	MOVL	$0, 0x14(FS)
   758  	RET
   759  settls:
   760  	MOVL	g_m(BX), AX
   761  	LEAL	m_tls(AX), AX	// AX = address of the tls field inside gg.m
   762  	MOVL	AX, 0x14(FS)
   763  #endif
   764  	get_tls(CX)
   765  	MOVL	BX, g(CX)	// store gg as the current g
   766  	RET
   767  
   768  // void setg_gcc(G*); set g. for use by gcc
        // rt0_go passes this function's address to _cgo_init. Uses only AX and DX.
   769  TEXT setg_gcc<>(SB), NOSPLIT, $0
   770  	get_tls(AX)
   771  	MOVL	gg+0(FP), DX
   772  	MOVL	DX, g(AX)	// store gg as the current g
   773  	RET
   774  
   775  // check that SP is in range [g->stack.lo, g->stack.hi)
        // Executes INT $3 when either comparison fails; otherwise returns.
        // Both comparisons are unsigned. Clobbers AX and CX.
   776  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   777  	get_tls(CX)
   778  	MOVL	g(CX), AX
   779  	CMPL	(g_stack+stack_hi)(AX), SP
   780  	JHI	2(PC)		// stack.hi > SP: skip the trap
   781  	INT	$3
   782  	CMPL	SP, (g_stack+stack_lo)(AX)
   783  	JHI	2(PC)		// SP > stack.lo: skip the trap
   784  	INT	$3
   785  	RET
   786  
   787  TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8
        	// Returns the word stored 4 bytes below argp, i.e. the return PC of the
        	// frame whose first-argument address was passed in. If that slot holds
        	// runtime·stackBarrierPC, the result of nextBarrierPC is returned instead.
   788  	MOVL	argp+0(FP),AX		// addr of first arg
   789  	MOVL	-4(AX),AX		// get calling pc
   790  	CMPL	AX, runtime·stackBarrierPC(SB)
   791  	JNE	nobar
   792  	// Get original return PC.
   793  	CALL	runtime·nextBarrierPC(SB)
   794  	MOVL	0(SP), AX		// result of nextBarrierPC, left in the 4-byte frame
   795  nobar:
   796  	MOVL	AX, ret+4(FP)
   797  	RET
   798  
   799  TEXT runtime·setcallerpc(SB),NOSPLIT,$4-8
        	// Overwrites the return PC stored 4 bytes below argp with pc. If that
        	// slot currently holds runtime·stackBarrierPC, it is left alone and pc
        	// is passed to setNextBarrierPC instead.
   800  	MOVL	argp+0(FP),AX		// addr of first arg
   801  	MOVL	pc+4(FP), BX
   802  	MOVL	-4(AX), DX		// DX = current return PC
   803  	CMPL	DX, runtime·stackBarrierPC(SB)
   804  	JEQ	setbar
   805  	MOVL	BX, -4(AX)		// set calling pc
   806  	RET
   807  setbar:
   808  	// Set the stack barrier return PC.
   809  	MOVL	BX, 0(SP)		// argument for setNextBarrierPC
   810  	CALL	runtime·setNextBarrierPC(SB)
   811  	RET
   812  
   813  TEXT runtime·getcallersp(SB), NOSPLIT, $0-8
        	// Returns argp unchanged: the value passed in is copied to the result.
   814  	MOVL	argp+0(FP), AX
   815  	MOVL	AX, ret+4(FP)
   816  	RET
   817  
   818  // func cputicks() int64
        // Reads the time-stamp counter. When cpuid_edx bit 26 is set, a fence is
        // issued first: LFENCE if lfenceBeforeRdtsc is 1, MFENCE otherwise.
   819  TEXT runtime·cputicks(SB),NOSPLIT,$0-8
   820  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // bit clear: no SSE2, so no fence instructions
   821  	JEQ	read_tsc
   822  	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
   823  	JNE	use_mfence
   824  	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE, emitted as raw bytes
   825  	JMP	read_tsc
   826  use_mfence:
   827  	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE, emitted as raw bytes
   828  read_tsc:
   829  	RDTSC				// counter in DX:AX
   830  	MOVL	AX, ret_lo+0(FP)
   831  	MOVL	DX, ret_hi+4(FP)
   832  	RET
   833  
   834  TEXT runtime·ldt0setup(SB),NOSPLIT,$16-0
   835  	// Ask setldt to point an LDT entry at m0.tls.
   836  	// Entry 7 is requested: 1 would do on Linux, but OS X cannot go lower than 7.
   837  	// The number is only a hint; setldt sets up GS with whatever it used.
   838  	LEAL	runtime·m0+m_tls(SB), AX
   839  	MOVL	AX, 4(SP)	// arg 1: address of m0.tls
   840  	MOVL	$32, 8(SP)	// arg 2: sizeof(tls array)
   841  	MOVL	$7, 0(SP)	// arg 0: requested entry number
   842  	CALL	runtime·setldt(SB)
   843  	RET
   844  
   845  TEXT runtime·emptyfunc(SB),0,$0-0
        	// Does nothing. Declared with flags 0 rather than NOSPLIT; rt0_go calls
        	// it with the note "fault if stack check is wrong".
   846  	RET
   847  
   848  TEXT runtime·abort(SB),NOSPLIT,$0-0
   849  	INT $0x3
   850  
   851  // memhash_varlen(p unsafe.Pointer, h seed) uintptr
   852  // Forwards to memhash(p, h, size), taking size from the word at
   853  // offset 4 of the closure that DX points to on entry.
   854  TEXT runtime·memhash_varlen(SB),NOSPLIT,$16-12
   855  	GO_ARGS
   856  	NO_LOCAL_POINTERS
        	// Each argument is loaded and stored straight away; the three copies
        	// are independent of one another.
   857  	MOVL	p+0(FP), AX
   858  	MOVL	AX, 0(SP)	// memhash arg 0: p
   859  	MOVL	h+4(FP), BX
   860  	MOVL	BX, 4(SP)	// memhash arg 1: h
   861  	MOVL	4(DX), CX
   862  	MOVL	CX, 8(SP)	// memhash arg 2: size from the closure
   863  	CALL	runtime·memhash(SB)
   864  	MOVL	12(SP), AX	// memhash result
   865  	MOVL	AX, ret+8(FP)
   866  	RET
   867  
   868  // hash function using AES hardware instructions
        // Loads the registers aeshashbody expects (AX = data, BX = length,
        // DX = address of the result slot) and tail-jumps to it.
   869  TEXT runtime·aeshash(SB),NOSPLIT,$0-16
   870  	MOVL	p+0(FP), AX	// ptr to data
   871  	MOVL	s+8(FP), BX	// size
   872  	LEAL	ret+12(FP), DX	// where aeshashbody stores the hash
   873  	JMP	runtime·aeshashbody(SB)
   874  
   875  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
        	// String variant of aeshash: p points at a string header. The length is
        	// read from offset 4 and the data pointer from offset 0 before the
        	// tail-jump to aeshashbody (AX = data, BX = length, DX = &ret).
   876  	MOVL	p+0(FP), AX	// ptr to string object
   877  	MOVL	4(AX), BX	// length of string
   878  	MOVL	(AX), AX	// string data
   879  	LEAL	ret+8(FP), DX	// where aeshashbody stores the hash
   880  	JMP	runtime·aeshashbody(SB)
   881  
   882  // AX: data
   883  // BX: length
   884  // DX: address to put return value
   885  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
   886  	MOVL	h+4(FP), X0	            // 32 bits of per-table hash seed
   887  	PINSRW	$4, BX, X0	            // 16 bits of length
   888  	PSHUFHW	$0, X0, X0	            // replace size with its low 2 bytes repeated 4 times
   889  	MOVO	X0, X1                      // save unscrambled seed
   890  	PXOR	runtime·aeskeysched(SB), X0 // xor in per-process seed
   891  	AESENC	X0, X0                      // scramble seed
   892  
   893  	CMPL	BX, $16
   894  	JB	aes0to15
   895  	JE	aes16
   896  	CMPL	BX, $32
   897  	JBE	aes17to32
   898  	CMPL	BX, $64
   899  	JBE	aes33to64
   900  	JMP	aes65plus
   901  	
   902  aes0to15:
   903  	TESTL	BX, BX
   904  	JE	aes0
   905  
   906  	ADDL	$16, AX
   907  	TESTW	$0xff0, AX
   908  	JE	endofpage
   909  
   910  	// 16 bytes loaded at this address won't cross
   911  	// a page boundary, so we can load it directly.
   912  	MOVOU	-16(AX), X1
   913  	ADDL	BX, BX
   914  	PAND	masks<>(SB)(BX*8), X1
   915  
   916  final1:	
   917  	AESENC	X0, X1  // scramble input, xor in seed
   918  	AESENC	X1, X1  // scramble combo 2 times
   919  	AESENC	X1, X1
   920  	MOVL	X1, (DX)
   921  	RET
   922  
   923  endofpage:
   924  	// address ends in 1111xxxx. Might be up against
   925  	// a page boundary, so load ending at last byte.
   926  	// Then shift bytes down using pshufb.
   927  	MOVOU	-32(AX)(BX*1), X1
   928  	ADDL	BX, BX
   929  	PSHUFB	shifts<>(SB)(BX*8), X1
   930  	JMP	final1
   931  
   932  aes0:
   933  	// Return scrambled input seed
   934  	AESENC	X0, X0
   935  	MOVL	X0, (DX)
   936  	RET
   937  
   938  aes16:
   939  	MOVOU	(AX), X1
   940  	JMP	final1
   941  
   942  aes17to32:
   943  	// make second starting seed
   944  	PXOR	runtime·aeskeysched+16(SB), X1
   945  	AESENC	X1, X1
   946  	
   947  	// load data to be hashed
   948  	MOVOU	(AX), X2
   949  	MOVOU	-16(AX)(BX*1), X3
   950  
   951  	// scramble 3 times
   952  	AESENC	X0, X2
   953  	AESENC	X1, X3
   954  	AESENC	X2, X2
   955  	AESENC	X3, X3
   956  	AESENC	X2, X2
   957  	AESENC	X3, X3
   958  
   959  	// combine results
   960  	PXOR	X3, X2
   961  	MOVL	X2, (DX)
   962  	RET
   963  
   964  aes33to64:
   965  	// make 3 more starting seeds
   966  	MOVO	X1, X2
   967  	MOVO	X1, X3
   968  	PXOR	runtime·aeskeysched+16(SB), X1
   969  	PXOR	runtime·aeskeysched+32(SB), X2
   970  	PXOR	runtime·aeskeysched+48(SB), X3
   971  	AESENC	X1, X1
   972  	AESENC	X2, X2
   973  	AESENC	X3, X3
   974  	
   975  	MOVOU	(AX), X4
   976  	MOVOU	16(AX), X5
   977  	MOVOU	-32(AX)(BX*1), X6
   978  	MOVOU	-16(AX)(BX*1), X7
   979  	
   980  	AESENC	X0, X4
   981  	AESENC	X1, X5
   982  	AESENC	X2, X6
   983  	AESENC	X3, X7
   984  	
   985  	AESENC	X4, X4
   986  	AESENC	X5, X5
   987  	AESENC	X6, X6
   988  	AESENC	X7, X7
   989  	
   990  	AESENC	X4, X4
   991  	AESENC	X5, X5
   992  	AESENC	X6, X6
   993  	AESENC	X7, X7
   994  
   995  	PXOR	X6, X4
   996  	PXOR	X7, X5
   997  	PXOR	X5, X4
   998  	MOVL	X4, (DX)
   999  	RET
  1000  
  1001  aes65plus:
  1002  	// make 3 more starting seeds
  1003  	MOVO	X1, X2
  1004  	MOVO	X1, X3
  1005  	PXOR	runtime·aeskeysched+16(SB), X1
  1006  	PXOR	runtime·aeskeysched+32(SB), X2
  1007  	PXOR	runtime·aeskeysched+48(SB), X3
  1008  	AESENC	X1, X1
  1009  	AESENC	X2, X2
  1010  	AESENC	X3, X3
  1011  	
  1012  	// start with last (possibly overlapping) block
  1013  	MOVOU	-64(AX)(BX*1), X4
  1014  	MOVOU	-48(AX)(BX*1), X5
  1015  	MOVOU	-32(AX)(BX*1), X6
  1016  	MOVOU	-16(AX)(BX*1), X7
  1017  
  1018  	// scramble state once
  1019  	AESENC	X0, X4
  1020  	AESENC	X1, X5
  1021  	AESENC	X2, X6
  1022  	AESENC	X3, X7
  1023  
  1024  	// compute number of remaining 64-byte blocks
  1025  	DECL	BX
  1026  	SHRL	$6, BX
  1027  	
  1028  aesloop:
  1029  	// scramble state, xor in a block
  1030  	MOVOU	(AX), X0
  1031  	MOVOU	16(AX), X1
  1032  	MOVOU	32(AX), X2
  1033  	MOVOU	48(AX), X3
  1034  	AESENC	X0, X4
  1035  	AESENC	X1, X5
  1036  	AESENC	X2, X6
  1037  	AESENC	X3, X7
  1038  
  1039  	// scramble state
  1040  	AESENC	X4, X4
  1041  	AESENC	X5, X5
  1042  	AESENC	X6, X6
  1043  	AESENC	X7, X7
  1044  
  1045  	ADDL	$64, AX
  1046  	DECL	BX
  1047  	JNE	aesloop
  1048  
  1049  	// 2 more scrambles to finish
  1050  	AESENC	X4, X4
  1051  	AESENC	X5, X5
  1052  	AESENC	X6, X6
  1053  	AESENC	X7, X7
  1054  	
  1055  	AESENC	X4, X4
  1056  	AESENC	X5, X5
  1057  	AESENC	X6, X6
  1058  	AESENC	X7, X7
  1059  
  1060  	PXOR	X6, X4
  1061  	PXOR	X7, X5
  1062  	PXOR	X5, X4
  1063  	MOVL	X4, (DX)
  1064  	RET
  1065  
   1066  TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
         	// Hashes the 4 bytes at p together with the seed h.
         	// X0 is built with the seed in dword 0 and the data in dword 1,
         	// then run through three AESENC rounds whose round keys are the
         	// first three 16-byte entries of runtime·aeskeysched.
         	// The low 32 bits of the scrambled X0 are the result.
   1067  	MOVL	p+0(FP), AX	// ptr to data
   1068  	MOVL	h+4(FP), X0	// seed -> dword 0 of X0
   1069  	PINSRD	$1, (AX), X0	// data -> dword 1 of X0
   1070  	AESENC	runtime·aeskeysched+0(SB), X0	// round 1
   1071  	AESENC	runtime·aeskeysched+16(SB), X0	// round 2
   1072  	AESENC	runtime·aeskeysched+32(SB), X0	// round 3
   1073  	MOVL	X0, ret+8(FP)	// return low 32 bits of the state
   1074  	RET
  1075  
   1076  TEXT runtime·aeshash64(SB),NOSPLIT,$0-12
         	// Hashes the 8 bytes at p together with the seed h.
         	// X0 is built with the data in its low 8 bytes (dwords 0-1) and
         	// the seed in dword 2, then run through three AESENC rounds whose
         	// round keys are the first three 16-byte entries of
         	// runtime·aeskeysched. The low 32 bits of X0 are the result.
   1077  	MOVL	p+0(FP), AX	// ptr to data
   1078  	MOVQ	(AX), X0	// data -> low 8 bytes of X0
   1079  	PINSRD	$2, h+4(FP), X0	// seed -> dword 2 of X0
   1080  	AESENC	runtime·aeskeysched+0(SB), X0	// round 1
   1081  	AESENC	runtime·aeskeysched+16(SB), X0	// round 2
   1082  	AESENC	runtime·aeskeysched+32(SB), X0	// round 3
   1083  	MOVL	X0, ret+8(FP)	// return low 32 bits of the state
   1084  	RET
  1085  
   1086  // simple mask to get rid of data in the high part of the register.
         // The table holds 16 entries of 16 bytes each (256 bytes, read-only).
         // Entry i, at offset 16*i, has its low i bytes set to 0xff and the
         // remaining 16-i bytes zero, so PAND with entry i keeps only the
         // first i bytes of a 16-byte load. The aes0to15 path of the AES hash
         // indexes it by length (it doubles the length and scales by 8, i.e.
         // 16*len). checkASM verifies that the table is 16-byte aligned.
   1087  DATA masks<>+0x00(SB)/4, $0x00000000
   1088  DATA masks<>+0x04(SB)/4, $0x00000000
   1089  DATA masks<>+0x08(SB)/4, $0x00000000
   1090  DATA masks<>+0x0c(SB)/4, $0x00000000
   1091  	
   1092  DATA masks<>+0x10(SB)/4, $0x000000ff
   1093  DATA masks<>+0x14(SB)/4, $0x00000000
   1094  DATA masks<>+0x18(SB)/4, $0x00000000
   1095  DATA masks<>+0x1c(SB)/4, $0x00000000
   1096  	
   1097  DATA masks<>+0x20(SB)/4, $0x0000ffff
   1098  DATA masks<>+0x24(SB)/4, $0x00000000
   1099  DATA masks<>+0x28(SB)/4, $0x00000000
   1100  DATA masks<>+0x2c(SB)/4, $0x00000000
   1101  	
   1102  DATA masks<>+0x30(SB)/4, $0x00ffffff
   1103  DATA masks<>+0x34(SB)/4, $0x00000000
   1104  DATA masks<>+0x38(SB)/4, $0x00000000
   1105  DATA masks<>+0x3c(SB)/4, $0x00000000
   1106  	
   1107  DATA masks<>+0x40(SB)/4, $0xffffffff
   1108  DATA masks<>+0x44(SB)/4, $0x00000000
   1109  DATA masks<>+0x48(SB)/4, $0x00000000
   1110  DATA masks<>+0x4c(SB)/4, $0x00000000
   1111  	
   1112  DATA masks<>+0x50(SB)/4, $0xffffffff
   1113  DATA masks<>+0x54(SB)/4, $0x000000ff
   1114  DATA masks<>+0x58(SB)/4, $0x00000000
   1115  DATA masks<>+0x5c(SB)/4, $0x00000000
   1116  	
   1117  DATA masks<>+0x60(SB)/4, $0xffffffff
   1118  DATA masks<>+0x64(SB)/4, $0x0000ffff
   1119  DATA masks<>+0x68(SB)/4, $0x00000000
   1120  DATA masks<>+0x6c(SB)/4, $0x00000000
   1121  	
   1122  DATA masks<>+0x70(SB)/4, $0xffffffff
   1123  DATA masks<>+0x74(SB)/4, $0x00ffffff
   1124  DATA masks<>+0x78(SB)/4, $0x00000000
   1125  DATA masks<>+0x7c(SB)/4, $0x00000000
   1126  	
   1127  DATA masks<>+0x80(SB)/4, $0xffffffff
   1128  DATA masks<>+0x84(SB)/4, $0xffffffff
   1129  DATA masks<>+0x88(SB)/4, $0x00000000
   1130  DATA masks<>+0x8c(SB)/4, $0x00000000
   1131  	
   1132  DATA masks<>+0x90(SB)/4, $0xffffffff
   1133  DATA masks<>+0x94(SB)/4, $0xffffffff
   1134  DATA masks<>+0x98(SB)/4, $0x000000ff
   1135  DATA masks<>+0x9c(SB)/4, $0x00000000
   1136  	
   1137  DATA masks<>+0xa0(SB)/4, $0xffffffff
   1138  DATA masks<>+0xa4(SB)/4, $0xffffffff
   1139  DATA masks<>+0xa8(SB)/4, $0x0000ffff
   1140  DATA masks<>+0xac(SB)/4, $0x00000000
   1141  	
   1142  DATA masks<>+0xb0(SB)/4, $0xffffffff
   1143  DATA masks<>+0xb4(SB)/4, $0xffffffff
   1144  DATA masks<>+0xb8(SB)/4, $0x00ffffff
   1145  DATA masks<>+0xbc(SB)/4, $0x00000000
   1146  	
   1147  DATA masks<>+0xc0(SB)/4, $0xffffffff
   1148  DATA masks<>+0xc4(SB)/4, $0xffffffff
   1149  DATA masks<>+0xc8(SB)/4, $0xffffffff
   1150  DATA masks<>+0xcc(SB)/4, $0x00000000
   1151  	
   1152  DATA masks<>+0xd0(SB)/4, $0xffffffff
   1153  DATA masks<>+0xd4(SB)/4, $0xffffffff
   1154  DATA masks<>+0xd8(SB)/4, $0xffffffff
   1155  DATA masks<>+0xdc(SB)/4, $0x000000ff
   1156  	
   1157  DATA masks<>+0xe0(SB)/4, $0xffffffff
   1158  DATA masks<>+0xe4(SB)/4, $0xffffffff
   1159  DATA masks<>+0xe8(SB)/4, $0xffffffff
   1160  DATA masks<>+0xec(SB)/4, $0x0000ffff
   1161  	
   1162  DATA masks<>+0xf0(SB)/4, $0xffffffff
   1163  DATA masks<>+0xf4(SB)/4, $0xffffffff
   1164  DATA masks<>+0xf8(SB)/4, $0xffffffff
   1165  DATA masks<>+0xfc(SB)/4, $0x00ffffff
   1166  
   1167  GLOBL masks<>(SB),RODATA,$256
  1168  
   1169  // these are arguments to pshufb. They move data down from
   1170  // the high bytes of the register to the low bytes of the register.
   1171  // index is how many bytes to move.
         // The table holds 16 entries of 16 bytes each (256 bytes, read-only).
         // For i = 1..15, entry i (offset 16*i) selects source bytes 16-i..15
         // into destination bytes 0..i-1; the remaining control bytes are
         // 0xff, whose set high bit makes PSHUFB write zero. Entry 0 is all
         // zero; the endofpage path that uses this table is not reached with
         // length 0, because that case branches to aes0 first.
         // checkASM verifies that the table is 16-byte aligned.
   1172  DATA shifts<>+0x00(SB)/4, $0x00000000
   1173  DATA shifts<>+0x04(SB)/4, $0x00000000
   1174  DATA shifts<>+0x08(SB)/4, $0x00000000
   1175  DATA shifts<>+0x0c(SB)/4, $0x00000000
   1176  	
   1177  DATA shifts<>+0x10(SB)/4, $0xffffff0f
   1178  DATA shifts<>+0x14(SB)/4, $0xffffffff
   1179  DATA shifts<>+0x18(SB)/4, $0xffffffff
   1180  DATA shifts<>+0x1c(SB)/4, $0xffffffff
   1181  	
   1182  DATA shifts<>+0x20(SB)/4, $0xffff0f0e
   1183  DATA shifts<>+0x24(SB)/4, $0xffffffff
   1184  DATA shifts<>+0x28(SB)/4, $0xffffffff
   1185  DATA shifts<>+0x2c(SB)/4, $0xffffffff
   1186  	
   1187  DATA shifts<>+0x30(SB)/4, $0xff0f0e0d
   1188  DATA shifts<>+0x34(SB)/4, $0xffffffff
   1189  DATA shifts<>+0x38(SB)/4, $0xffffffff
   1190  DATA shifts<>+0x3c(SB)/4, $0xffffffff
   1191  	
   1192  DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c
   1193  DATA shifts<>+0x44(SB)/4, $0xffffffff
   1194  DATA shifts<>+0x48(SB)/4, $0xffffffff
   1195  DATA shifts<>+0x4c(SB)/4, $0xffffffff
   1196  	
   1197  DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b
   1198  DATA shifts<>+0x54(SB)/4, $0xffffff0f
   1199  DATA shifts<>+0x58(SB)/4, $0xffffffff
   1200  DATA shifts<>+0x5c(SB)/4, $0xffffffff
   1201  	
   1202  DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a
   1203  DATA shifts<>+0x64(SB)/4, $0xffff0f0e
   1204  DATA shifts<>+0x68(SB)/4, $0xffffffff
   1205  DATA shifts<>+0x6c(SB)/4, $0xffffffff
   1206  	
   1207  DATA shifts<>+0x70(SB)/4, $0x0c0b0a09
   1208  DATA shifts<>+0x74(SB)/4, $0xff0f0e0d
   1209  DATA shifts<>+0x78(SB)/4, $0xffffffff
   1210  DATA shifts<>+0x7c(SB)/4, $0xffffffff
   1211  	
   1212  DATA shifts<>+0x80(SB)/4, $0x0b0a0908
   1213  DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c
   1214  DATA shifts<>+0x88(SB)/4, $0xffffffff
   1215  DATA shifts<>+0x8c(SB)/4, $0xffffffff
   1216  	
   1217  DATA shifts<>+0x90(SB)/4, $0x0a090807
   1218  DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b
   1219  DATA shifts<>+0x98(SB)/4, $0xffffff0f
   1220  DATA shifts<>+0x9c(SB)/4, $0xffffffff
   1221  	
   1222  DATA shifts<>+0xa0(SB)/4, $0x09080706
   1223  DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a
   1224  DATA shifts<>+0xa8(SB)/4, $0xffff0f0e
   1225  DATA shifts<>+0xac(SB)/4, $0xffffffff
   1226  	
   1227  DATA shifts<>+0xb0(SB)/4, $0x08070605
   1228  DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09
   1229  DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d
   1230  DATA shifts<>+0xbc(SB)/4, $0xffffffff
   1231  	
   1232  DATA shifts<>+0xc0(SB)/4, $0x07060504
   1233  DATA shifts<>+0xc4(SB)/4, $0x0b0a0908
   1234  DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c
   1235  DATA shifts<>+0xcc(SB)/4, $0xffffffff
   1236  	
   1237  DATA shifts<>+0xd0(SB)/4, $0x06050403
   1238  DATA shifts<>+0xd4(SB)/4, $0x0a090807
   1239  DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b
   1240  DATA shifts<>+0xdc(SB)/4, $0xffffff0f
   1241  	
   1242  DATA shifts<>+0xe0(SB)/4, $0x05040302
   1243  DATA shifts<>+0xe4(SB)/4, $0x09080706
   1244  DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a
   1245  DATA shifts<>+0xec(SB)/4, $0xffff0f0e
   1246  	
   1247  DATA shifts<>+0xf0(SB)/4, $0x04030201
   1248  DATA shifts<>+0xf4(SB)/4, $0x08070605
   1249  DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09
   1250  DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d
   1251  
   1252  GLOBL shifts<>(SB),RODATA,$256
  1253  
   1254  TEXT ·checkASM(SB),NOSPLIT,$0-1
   1255  	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
         	// Returns a single byte: 1 when both table addresses have their
         	// low four bits clear, 0 otherwise. The two addresses are OR-ed
         	// together so that one TESTL covers both.
   1256  	MOVL	$masks<>(SB), AX	// AX = address of masks
   1257  	MOVL	$shifts<>(SB), BX	// BX = address of shifts
   1258  	ORL	BX, AX			// any low bit set in either address survives
   1259  	TESTL	$15, AX			// ZF = 1 iff both are multiples of 16
   1260  	SETEQ	ret+0(FP)
   1261  	RET
  1262  
  1263  // memequal(p, q unsafe.Pointer, size uintptr) bool
  1264  TEXT runtime·memequal(SB),NOSPLIT,$0-13
  1265  	MOVL	a+0(FP), SI
  1266  	MOVL	b+4(FP), DI
  1267  	CMPL	SI, DI
  1268  	JEQ	eq
  1269  	MOVL	size+8(FP), BX
  1270  	LEAL	ret+12(FP), AX
  1271  	JMP	runtime·memeqbody(SB)
  1272  eq:
  1273  	MOVB    $1, ret+12(FP)
  1274  	RET
  1275  
  1276  // memequal_varlen(a, b unsafe.Pointer) bool
  1277  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
  1278  	MOVL    a+0(FP), SI
  1279  	MOVL    b+4(FP), DI
  1280  	CMPL    SI, DI
  1281  	JEQ     eq
  1282  	MOVL    4(DX), BX    // compiler stores size at offset 4 in the closure
  1283  	LEAL	ret+8(FP), AX
  1284  	JMP	runtime·memeqbody(SB)
  1285  eq:
  1286  	MOVB    $1, ret+8(FP)
  1287  	RET
  1288  
  1289  // eqstring tests whether two strings are equal.
  1290  // The compiler guarantees that strings passed
  1291  // to eqstring have equal length.
  1292  // See runtime_test.go:eqstring_generic for
  1293  // equivalent Go code.
  1294  TEXT runtime·eqstring(SB),NOSPLIT,$0-17
  1295  	MOVL	s1str+0(FP), SI
  1296  	MOVL	s2str+8(FP), DI
  1297  	CMPL	SI, DI
  1298  	JEQ	same
  1299  	MOVL	s1len+4(FP), BX
  1300  	LEAL	v+16(FP), AX
  1301  	JMP	runtime·memeqbody(SB)
  1302  same:
  1303  	MOVB	$1, v+16(FP)
  1304  	RET
  1305  
  1306  TEXT bytes·Equal(SB),NOSPLIT,$0-25
  1307  	MOVL	a_len+4(FP), BX
  1308  	MOVL	b_len+16(FP), CX
  1309  	CMPL	BX, CX
  1310  	JNE	eqret
  1311  	MOVL	a+0(FP), SI
  1312  	MOVL	b+12(FP), DI
  1313  	LEAL	ret+24(FP), AX
  1314  	JMP	runtime·memeqbody(SB)
  1315  eqret:
  1316  	MOVB	$0, ret+24(FP)
  1317  	RET
  1318  
   1319  // a in SI
   1320  // b in DI
   1321  // count in BX
   1322  // address of result byte in AX
         //
         // Shared comparison tail: memequal, memequal_varlen, eqstring and
         // bytes·Equal all JMP here with the registers above loaded.
         // Stores 1 at (AX) if the count bytes at SI and DI are identical and
         // 0 otherwise, then RETs (returning to the caller of whichever
         // routine jumped here). BX, CX, DX, SI, DI and, on the SSE2 path,
         // X0-X7 are overwritten.
   1323  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
   1324  	CMPL	BX, $4
   1325  	JB	small			// 0-3 bytes get special page-safe handling
   1326  
   1327  	// 64 bytes at a time using xmm registers
   1328  hugeloop:
   1329  	CMPL	BX, $64
   1330  	JB	bigloop
   1331  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2 (bit 26)
   1332  	JE	bigloop
   1333  	MOVOU	(SI), X0
   1334  	MOVOU	(DI), X1
   1335  	MOVOU	16(SI), X2
   1336  	MOVOU	16(DI), X3
   1337  	MOVOU	32(SI), X4
   1338  	MOVOU	32(DI), X5
   1339  	MOVOU	48(SI), X6
   1340  	MOVOU	48(DI), X7
   1341  	PCMPEQB	X1, X0			// each byte becomes 0xff where a and b agree
   1342  	PCMPEQB	X3, X2
   1343  	PCMPEQB	X5, X4
   1344  	PCMPEQB	X7, X6
   1345  	PAND	X2, X0			// fold the four 16-byte results into X0
   1346  	PAND	X6, X4
   1347  	PAND	X4, X0
   1348  	PMOVMSKB X0, DX			// one bit per byte lane
   1349  	ADDL	$64, SI
   1350  	ADDL	$64, DI
   1351  	SUBL	$64, BX
   1352  	CMPL	DX, $0xffff		// all 16 bits set <=> all 64 bytes matched
   1353  	JEQ	hugeloop
   1354  	MOVB	$0, (AX)
   1355  	RET
   1356  
   1357  	// 4 bytes at a time using 32-bit register
   1358  bigloop:
   1359  	CMPL	BX, $4
   1360  	JBE	leftover
   1361  	MOVL	(SI), CX
   1362  	MOVL	(DI), DX
   1363  	ADDL	$4, SI
   1364  	ADDL	$4, DI
   1365  	SUBL	$4, BX
   1366  	CMPL	CX, DX
   1367  	JEQ	bigloop
   1368  	MOVB	$0, (AX)
   1369  	RET
   1370  
   1371  	// remaining 0-4 bytes
         	// The loads end exactly at the last byte of each buffer, so they
         	// may overlap bytes that were already compared. This path is only
         	// reached when the original count was at least 4, so the 4-byte
         	// load never starts before the buffer.
   1372  leftover:
   1373  	MOVL	-4(SI)(BX*1), CX
   1374  	MOVL	-4(DI)(BX*1), DX
   1375  	CMPL	CX, DX
   1376  	SETEQ	(AX)
   1377  	RET
   1378  
         	// count is 0-3 here.
   1379  small:
   1380  	CMPL	BX, $0
   1381  	JEQ	equal			// ZF is set, so SETEQ at equal stores 1
   1382  
         	// CX = -8*count. Shifts only use the low 5 bits of CX, so the
         	// shifts below move by 32-8*count bits.
   1383  	LEAL	0(BX*8), CX
   1384  	NEGL	CX
   1385  
   1386  	MOVL	SI, DX
   1387  	CMPB	DX, $0xfc		// low address byte above 0xfc?
   1388  	JA	si_high
   1389  
   1390  	// load at SI won't cross a page boundary.
   1391  	MOVL	(SI), SI
   1392  	JMP	si_finish
   1393  si_high:
   1394  	// address ends in 111111xx. Load up to bytes we want, move to correct position.
   1395  	MOVL	-4(SI)(BX*1), SI
   1396  	SHRL	CX, SI			// wanted bytes now sit in the low count bytes
   1397  si_finish:
   1398  
   1399  	// same for DI.
   1400  	MOVL	DI, DX
   1401  	CMPB	DX, $0xfc
   1402  	JA	di_high
   1403  	MOVL	(DI), DI
   1404  	JMP	di_finish
   1405  di_high:
   1406  	MOVL	-4(DI)(BX*1), DI
   1407  	SHRL	CX, DI
   1408  di_finish:
   1409  
         	// The difference is shifted left so that only the low count bytes
         	// survive; ZF after the shift says whether those bytes matched.
   1410  	SUBL	SI, DI
   1411  	SHLL	CX, DI
   1412  equal:
   1413  	SETEQ	(AX)
   1414  	RET
  1415  
  1416  TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
  1417  	MOVL	s1_base+0(FP), SI
  1418  	MOVL	s1_len+4(FP), BX
  1419  	MOVL	s2_base+8(FP), DI
  1420  	MOVL	s2_len+12(FP), DX
  1421  	LEAL	ret+16(FP), AX
  1422  	JMP	runtime·cmpbody(SB)
  1423  
  1424  TEXT bytes·Compare(SB),NOSPLIT,$0-28
  1425  	MOVL	s1+0(FP), SI
  1426  	MOVL	s1+4(FP), BX
  1427  	MOVL	s2+12(FP), DI
  1428  	MOVL	s2+16(FP), DX
  1429  	LEAL	ret+24(FP), AX
  1430  	JMP	runtime·cmpbody(SB)
  1431  
  1432  TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
  1433  	MOVL	s+0(FP), SI
  1434  	MOVL	s_len+4(FP), CX
  1435  	MOVB	c+12(FP), AL
  1436  	MOVL	SI, DI
  1437  	CLD; REPN; SCASB
  1438  	JZ 3(PC)
  1439  	MOVL	$-1, ret+16(FP)
  1440  	RET
  1441  	SUBL	SI, DI
  1442  	SUBL	$1, DI
  1443  	MOVL	DI, ret+16(FP)
  1444  	RET
  1445  
  1446  TEXT strings·IndexByte(SB),NOSPLIT,$0-16
  1447  	MOVL	s+0(FP), SI
  1448  	MOVL	s_len+4(FP), CX
  1449  	MOVB	c+8(FP), AL
  1450  	MOVL	SI, DI
  1451  	CLD; REPN; SCASB
  1452  	JZ 3(PC)
  1453  	MOVL	$-1, ret+12(FP)
  1454  	RET
  1455  	SUBL	SI, DI
  1456  	SUBL	$1, DI
  1457  	MOVL	DI, ret+12(FP)
  1458  	RET
  1459  
   1460  // input:
   1461  //   SI = a
   1462  //   DI = b
   1463  //   BX = alen
   1464  //   DX = blen
   1465  //   AX = address of return word (set to 1/0/-1)
         //
         // Shared comparison tail: runtime·cmpstring and bytes·Compare JMP
         // here with the registers above loaded. The first min(alen, blen)
         // bytes are compared as unsigned bytes; the first difference decides
         // the result (+1 if a's byte is larger, -1 if smaller). If there is
         // no difference the lengths decide: +1 if alen > blen, 0 if equal,
         // -1 if alen < blen. The result is stored at (AX) and the routine
         // RETs to the caller of whichever routine jumped here.
         // BX, CX, DX, BP, SI, DI and (on the SSE2 path) X0-X1 are overwritten.
   1466  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
   1467  	MOVL	DX, BP
   1468  	SUBL	BX, DX // DX = blen-alen
   1469  	JLE	2(PC) // blen <= alen: BP (= blen) is already the minimum
   1470  	MOVL	BX, BP // BP = min(alen, blen)
   1471  	CMPL	SI, DI
   1472  	JEQ	allsame // same start address: only the lengths can differ
   1473  	CMPL	BP, $4
   1474  	JB	small
   1475  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2 (bit 26)
   1476  	JE	mediumloop
         	// 16 bytes at a time using xmm registers
   1477  largeloop:
   1478  	CMPL	BP, $16
   1479  	JB	mediumloop
   1480  	MOVOU	(SI), X0
   1481  	MOVOU	(DI), X1
   1482  	PCMPEQB X0, X1
   1483  	PMOVMSKB X1, BX
   1484  	XORL	$0xffff, BX	// convert EQ to NE
   1485  	JNE	diff16	// branch if at least one byte is not equal
   1486  	ADDL	$16, SI
   1487  	ADDL	$16, DI
   1488  	SUBL	$16, BP
   1489  	JMP	largeloop
   1490  
   1491  diff16:
   1492  	BSFL	BX, BX	// index of first byte that differs
   1493  	XORL	DX, DX
   1494  	MOVB	(SI)(BX*1), CX
   1495  	CMPB	CX, (DI)(BX*1)
   1496  	SETHI	DX	// 1 if a's byte is above b's byte (unsigned)
   1497  	LEAL	-1(DX*2), DX	// convert 1/0 to +1/-1
   1498  	MOVL	DX, (AX)
   1499  	RET
   1500  
         	// 4 bytes at a time using 32-bit registers
   1501  mediumloop:
   1502  	CMPL	BP, $4
   1503  	JBE	_0through4
   1504  	MOVL	(SI), BX
   1505  	MOVL	(DI), CX
   1506  	CMPL	BX, CX
   1507  	JNE	diff4
   1508  	ADDL	$4, SI
   1509  	ADDL	$4, DI
   1510  	SUBL	$4, BP
   1511  	JMP	mediumloop
   1512  
         	// 0-4 bytes remain. The loads end at the last common byte and may
         	// overlap bytes already compared; the common length was at least
         	// 4 when this path was entered, so they stay inside the buffers.
   1513  _0through4:
   1514  	MOVL	-4(SI)(BP*1), BX
   1515  	MOVL	-4(DI)(BP*1), CX
   1516  	CMPL	BX, CX
   1517  	JEQ	allsame
   1518  
         	// BX and CX hold 4 bytes of a and b that are known to differ.
   1519  diff4:
   1520  	BSWAPL	BX	// reverse order of bytes
   1521  	BSWAPL	CX
   1522  	XORL	BX, CX	// find bit differences
   1523  	BSRL	CX, CX	// index of highest bit difference
   1524  	SHRL	CX, BX	// move a's bit to bottom
   1525  	ANDL	$1, BX	// mask bit
   1526  	LEAL	-1(BX*2), BX // 1/0 => +1/-1
   1527  	MOVL	BX, (AX)
   1528  	RET
   1529  
   1530  	// 0-3 bytes in common
         	// CX = -8*BP. Shifts only use the low 5 bits of CX, so the shifts
         	// below move by 32-8*BP bits. NEGL leaves ZF set when BP is 0.
   1531  small:
   1532  	LEAL	(BP*8), CX
   1533  	NEGL	CX
   1534  	JEQ	allsame
   1535  
   1536  	// load si
         	// If the low address byte is above 0xfc, load the 4 bytes ending at
         	// the last wanted byte instead and shift them down; this mirrors
         	// the page-boundary handling in memeqbody's small path.
         	// NOTE(review): SI/DI have no 8-bit form on 386; presumably the
         	// assembler handles CMPB on them -- confirm.
   1537  	CMPB	SI, $0xfc
   1538  	JA	si_high
   1539  	MOVL	(SI), SI
   1540  	JMP	si_finish
   1541  si_high:
   1542  	MOVL	-4(SI)(BP*1), SI
   1543  	SHRL	CX, SI
   1544  si_finish:
   1545  	SHLL	CX, SI	// keep only the BP wanted bytes, moved to the top
   1546  
   1547  	// same for di
   1548  	CMPB	DI, $0xfc
   1549  	JA	di_high
   1550  	MOVL	(DI), DI
   1551  	JMP	di_finish
   1552  di_high:
   1553  	MOVL	-4(DI)(BP*1), DI
   1554  	SHRL	CX, DI
   1555  di_finish:
   1556  	SHLL	CX, DI
   1557  
   1558  	BSWAPL	SI	// reverse order of bytes
   1559  	BSWAPL	DI
   1560  	XORL	SI, DI	// find bit differences
   1561  	JEQ	allsame
   1562  	BSRL	DI, CX	// index of highest bit difference
   1563  	SHRL	CX, SI	// move a's bit to bottom
   1564  	ANDL	$1, SI	// mask bit
   1565  	LEAL	-1(SI*2), BX // 1/0 => +1/-1
   1566  	MOVL	BX, (AX)
   1567  	RET
   1568  
   1569  	// all the bytes in common are the same, so we just need
   1570  	// to compare the lengths.
         	// DX still holds blen-alen from the top of the function.
   1571  allsame:
   1572  	XORL	BX, BX
   1573  	XORL	CX, CX
   1574  	TESTL	DX, DX
   1575  	SETLT	BX	// 1 if alen > blen
   1576  	SETEQ	CX	// 1 if alen == blen
   1577  	LEAL	-1(CX)(BX*2), BX	// 1,0,-1 result
   1578  	MOVL	BX, (AX)
   1579  	RET
  1580  
  1581  TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
  1582  	get_tls(CX)
  1583  	MOVL	g(CX), AX
  1584  	MOVL	g_m(AX), AX
  1585  	MOVL	m_fastrand(AX), DX
  1586  	ADDL	DX, DX
  1587  	MOVL	DX, BX
  1588  	XORL	$0x88888eef, DX
  1589  	JPL	2(PC)
  1590  	MOVL	BX, DX
  1591  	MOVL	DX, m_fastrand(AX)
  1592  	MOVL	DX, ret+0(FP)
  1593  	RET
  1594  
   1595  TEXT runtime·return0(SB), NOSPLIT, $0
         	// Sets AX to 0 and returns. No stack result slot is declared, so
         	// the zero is delivered in the AX register itself.
         	// NOTE(review): presumably for callers that inspect AX directly
         	// after the call -- confirm against the call sites.
   1596  	MOVL	$0, AX
   1597  	RET
  1598  
   1599  // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
   1600  // Must obey the gcc calling convention.
         // The value is returned in AX; the explicit instructions below
         // write only CX and AX.
         // NOTE(review): get_tls is a macro from go_tls.h; presumably it
         // writes only CX -- confirm.
   1601  TEXT _cgo_topofstack(SB),NOSPLIT,$0
   1602  	get_tls(CX)
   1603  	MOVL	g(CX), AX			// AX = g
   1604  	MOVL	g_m(AX), AX			// AX = g->m
   1605  	MOVL	m_curg(AX), AX			// AX = g->m->curg
   1606  	MOVL	(g_stack+stack_hi)(AX), AX	// AX = curg.stack.hi
   1607  	RET
  1608  
   1609  // The top-most function running on a goroutine
   1610  // returns to goexit+PCQuantum.
         // The body is a one-byte NOP, a CALL to goexit1, and another one-byte
         // NOP; there is no RET because goexit1 does not return.
         // NOTE(review): presumably PCQuantum is 1 on 386, so goexit+PCQuantum
         // is the address of the CALL just past the first NOP -- confirm.
   1611  TEXT runtime·goexit(SB),NOSPLIT,$0-0
   1612  	BYTE	$0x90	// NOP
   1613  	CALL	runtime·goexit1(SB)	// does not return
   1614  	// traceback from goexit1 must hit code range of goexit
         	// (the trailing NOP keeps the CALL's return address inside goexit)
   1615  	BYTE	$0x90	// NOP
  1616  
   1617  // Prefetching doesn't seem to help.
         // prefetcht0 is therefore a no-op here: it ignores its 4-byte
         // argument and returns immediately without issuing a prefetch.
   1618  TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4
   1619  	RET
  1620  
   1621  TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4
         	// No-op: the 4-byte argument is ignored and no prefetch is issued.
   1622  	RET
  1623  
   1624  TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4
         	// No-op: the 4-byte argument is ignored and no prefetch is issued.
   1625  	RET
  1626  
   1627  TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4
         	// No-op: the 4-byte argument is ignored and no prefetch is issued.
   1628  	RET
  1629  
   1630  // Add a module's moduledata to the linked list of moduledata objects. This
   1631  // is called from .init_array by a function generated in the linker and so
   1632  // follows the platform ABI wrt register preservation -- it only touches AX,
   1633  // CX (implicitly) and DX, but it does not follow the ABI wrt arguments:
   1634  // instead the pointer to the moduledata is passed in AX.
         // The new moduledata is appended at the tail: the current last
         // element's next field is pointed at it, and it then becomes
         // runtime·lastmoduledatap.
   1635  TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
   1636         MOVL    runtime·lastmoduledatap(SB), DX	// DX = current tail of the list
   1637         MOVL    AX, moduledata_next(DX)		// tail.next = new moduledata
   1638         MOVL    AX, runtime·lastmoduledatap(SB)	// new moduledata is now the tail
   1639         RET