github.com/varialus/godfly@v0.0.0-20130904042352-1934f9f095ab/src/pkg/runtime/asm_386.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "zasm_GOOS_GOARCH.h"
     6  #include "funcdata.h"
     7  #include "../../cmd/ld/textflag.h"
     8  
     9  TEXT _rt0_go(SB),NOSPLIT,$0
        	// Process bootstrap. In order: stash argc/argv on an aligned stack,
        	// give g0 default stack bounds, record CPUID feature bits, set up
        	// thread-local storage (through _cgo_init when present, otherwise
        	// through ldt0setup), install m0/g0, run the runtime initialisers,
        	// queue runtime·main as a new goroutine and start this M.
    10  	// copy arguments forward on an even stack
    11  	MOVL	argc+0(FP), AX
    12  	MOVL	argv+4(FP), BX
    13  	SUBL	$128, SP		// plenty of scratch
    14  	ANDL	$~15, SP		// round SP down to a 16-byte boundary
    15  	MOVL	AX, 120(SP)		// save argc, argv away
    16  	MOVL	BX, 124(SP)
    17  
    18  	// set default stack bounds.
    19  	// _cgo_init may update stackguard.
    20  	MOVL	$runtime·g0(SB), BP
    21  	LEAL	(-64*1024+104)(SP), BX	// guard = SP - 64KB + 104
    22  	MOVL	BX, g_stackguard(BP)
    23  	MOVL	BX, g_stackguard0(BP)
    24  	MOVL	SP, g_stackbase(BP)
    25  	
    26  	// find out information about the processor we're on
    27  	MOVL	$0, AX
    28  	CPUID
    29  	CMPL	AX, $0		// leaf 0 returns the highest supported leaf in AX
    30  	JE	nocpuinfo
    31  	MOVL	$1, AX
    32  	CPUID
    33  	MOVL	CX, runtime·cpuid_ecx(SB)
    34  	MOVL	DX, runtime·cpuid_edx(SB)
    35  nocpuinfo:	
    36  
    37  	// if there is an _cgo_init, call it to let it
    38  	// initialize and to set up GS.  if not,
    39  	// we set up GS ourselves.
    40  	MOVL	_cgo_init(SB), AX
    41  	TESTL	AX, AX
    42  	JZ	needtls
    43  	MOVL	$setmg_gcc<>(SB), BX
    44  	MOVL	BX, 4(SP)	// second argument: address of setmg_gcc
    45  	MOVL	BP, 0(SP)	// first argument: &g0 (BP was loaded above)
    46  	CALL	AX
    47  	// update stackguard after _cgo_init
    48  	MOVL	$runtime·g0(SB), CX
    49  	MOVL	g_stackguard0(CX), AX
    50  	MOVL	AX, g_stackguard(CX)
    51  	// skip runtime·ldt0setup(SB) and tls test after _cgo_init for non-windows
    52  	CMPL runtime·iswindows(SB), $0
    53  	JEQ ok
    54  needtls:
    55  	// skip runtime·ldt0setup(SB) and tls test on Plan 9 in all cases
    56  	CMPL	runtime·isplan9(SB), $1
    57  	JEQ	ok
    58  
    59  	// set up %gs
    60  	CALL	runtime·ldt0setup(SB)
    61  
    62  	// store through it, to make sure it works
    63  	get_tls(BX)
    64  	MOVL	$0x123, g(BX)
    65  	MOVL	runtime·tls0(SB), AX	// read the same slot back directly from tls0
    66  	CMPL	AX, $0x123
    67  	JEQ	ok
    68  	MOVL	AX, 0	// abort: store to address 0 to force a fault
    69  ok:
    70  	// set up m and g "registers"
    71  	get_tls(BX)
    72  	LEAL	runtime·g0(SB), CX
    73  	MOVL	CX, g(BX)
    74  	LEAL	runtime·m0(SB), AX
    75  	MOVL	AX, m(BX)
    76  
    77  	// save m->g0 = g0
    78  	MOVL	CX, m_g0(AX)
    79  
    80  	CALL	runtime·emptyfunc(SB)	// fault if stack check is wrong
    81  
    82  	// convention is D is always cleared
    83  	CLD
    84  
    85  	CALL	runtime·check(SB)
    86  
    87  	// saved argc, argv
    88  	MOVL	120(SP), AX
    89  	MOVL	AX, 0(SP)
    90  	MOVL	124(SP), AX
    91  	MOVL	AX, 4(SP)
    92  	CALL	runtime·args(SB)
    93  	CALL	runtime·osinit(SB)
    94  	CALL	runtime·hashinit(SB)
    95  	CALL	runtime·schedinit(SB)
    96  
    97  	// create a new goroutine to start program
    98  	PUSHL	$runtime·main·f(SB)	// entry
    99  	PUSHL	$0	// arg size
   100  	ARGSIZE(8)
   101  	CALL	runtime·newproc(SB)
   102  	ARGSIZE(-1)
   103  	POPL	AX	// discard the two words pushed for newproc
   104  	POPL	AX
   105  
   106  	// start this M
   107  	CALL	runtime·mstart(SB)
   108  
   109  	INT $3	// trap if mstart ever comes back (presumably it never returns)
   110  	RET
   111  
   112  DATA	runtime·main·f+0(SB)/4,$runtime·main(SB)	// one 4-byte word holding the address of runtime·main
   113  GLOBL	runtime·main·f(SB),RODATA,$4	// read-only; _rt0_go pushes its address as the newproc entry
   114  
   115  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
        	// Raise a breakpoint trap (INT 3) and, if execution resumes, return.
   116  	INT $3
   117  	RET
   118  
   119  TEXT runtime·asminit(SB),NOSPLIT,$0-0
        	// Load the x87 FPU control word with 0x27F.
   120  	// Linux and MinGW start the FPU in extended double precision.
   121  	// Other operating systems use double precision.
   122  	// Change to double precision to match them,
   123  	// and to match other hardware that only has double.
   124  	PUSHL $0x27F	// control word: precision control = double, all exceptions masked
   125  	FLDCW	0(SP)	// load it from the slot just pushed
   126  	POPL AX	// drop the temporary; AX is scratch
   127  	RET
   128  
   129  /*
   130   *  go-routine
   131   */
   132  
   133  // void gosave(Gobuf*)
   134  // save state in Gobuf; setjmp.
        // Records the caller's SP and return PC and the current g in the Gobuf,
        // and zeroes its ret and ctxt fields.
   135  TEXT runtime·gosave(SB), NOSPLIT, $0-4
   136  	MOVL	4(SP), AX		// gobuf
   137  	LEAL	4(SP), BX		// caller's SP (SP once our return address is popped)
   138  	MOVL	BX, gobuf_sp(AX)
   139  	MOVL	0(SP), BX		// caller's PC (our return address)
   140  	MOVL	BX, gobuf_pc(AX)
   141  	MOVL	$0, gobuf_ret(AX)
   142  	MOVL	$0, gobuf_ctxt(AX)
   143  	get_tls(CX)
   144  	MOVL	g(CX), BX
   145  	MOVL	BX, gobuf_g(AX)		// gobuf.g = current g
   146  	RET
   147  
   148  // void gogo(Gobuf*)
   149  // restore state from Gobuf; longjmp.
        // Makes gobuf.g the current g, switches SP to gobuf.sp, loads gobuf.ret
        // into AX and gobuf.ctxt into DX, clears sp/ret/ctxt in the Gobuf and
        // jumps to gobuf.pc. Does not return to its caller.
   150  TEXT runtime·gogo(SB), NOSPLIT, $0-4
   151  	MOVL	4(SP), BX		// gobuf
   152  	MOVL	gobuf_g(BX), DX
   153  	MOVL	0(DX), CX		// make sure g != nil (faults here if it is)
   154  	get_tls(CX)
   155  	MOVL	DX, g(CX)
   156  	MOVL	gobuf_sp(BX), SP	// restore SP
   157  	MOVL	gobuf_ret(BX), AX
   158  	MOVL	gobuf_ctxt(BX), DX
   159  	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
   160  	MOVL	$0, gobuf_ret(BX)
   161  	MOVL	$0, gobuf_ctxt(BX)
   162  	MOVL	gobuf_pc(BX), BX	// BX is reused: the gobuf pointer is no longer needed
   163  	JMP	BX
   164  
   165  // void mcall(void (*fn)(G*))
   166  // Switch to m->g0's stack, call fn(g).
   167  // Fn must never return.  It should gogo(&g->sched)
   168  // to keep running g.
        // The caller's PC and SP and g itself are saved in g->sched first.
        // Calling mcall while already on m->g0 jumps to runtime·badmcall;
        // fn returning jumps to runtime·badmcall2.
   169  TEXT runtime·mcall(SB), NOSPLIT, $0-4
   170  	MOVL	fn+0(FP), DI
   171  	
   172  	get_tls(CX)
   173  	MOVL	g(CX), AX	// save state in g->sched
   174  	MOVL	0(SP), BX	// caller's PC
   175  	MOVL	BX, (g_sched+gobuf_pc)(AX)
   176  	LEAL	4(SP), BX	// caller's SP
   177  	MOVL	BX, (g_sched+gobuf_sp)(AX)
   178  	MOVL	AX, (g_sched+gobuf_g)(AX)
   179  
   180  	// switch to m->g0 & its stack, call fn
   181  	MOVL	m(CX), BX
   182  	MOVL	m_g0(BX), SI
   183  	CMPL	SI, AX	// if g == m->g0 call badmcall
   184  	JNE	3(PC)	// not on g0: skip the two-instruction badmcall jump
   185  	MOVL	$runtime·badmcall(SB), AX
   186  	JMP	AX
   187  	MOVL	SI, g(CX)	// g = m->g0
   188  	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   189  	PUSHL	AX	// argument to fn: the g we switched away from
   190  	CALL	DI
   191  	POPL	AX
   192  	MOVL	$runtime·badmcall2(SB), AX	// reached only if fn returned
   193  	JMP	AX
   194  	RET
   195  
   196  /*
   197   * support for morestack
   198   */
   199  
   200  // Called during function prolog when more stack is needed.
   201  //
   202  // The traceback routines see morestack on a g0 as being
   203  // the top of a stack (for example, morestack calling newstack
   204  // calling the scheduler calling newm calling gc), so we must
   205  // record an argument size. For that purpose, it has no arguments.
        //
        // Register inputs read below: DI = frame size, AX = argument size,
        // DX = context value saved into g->sched.ctxt.
   206  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   207  	// Cannot grow scheduler stack (m->g0).
   208  	get_tls(CX)
   209  	MOVL	m(CX), BX
   210  	MOVL	m_g0(BX), SI
   211  	CMPL	g(CX), SI
   212  	JNE	2(PC)	// not on g0: skip the trap
   213  	INT	$3
   214  
   215  	// frame size in DI
   216  	// arg size in AX
   217  	// Save in m.
   218  	MOVL	DI, m_moreframesize(BX)
   219  	MOVL	AX, m_moreargsize(BX)
   220  
   221  	// Called from f.
   222  	// Set m->morebuf to f's caller.
   223  	MOVL	4(SP), DI	// f's caller's PC
   224  	MOVL	DI, (m_morebuf+gobuf_pc)(BX)
   225  	LEAL	8(SP), CX	// f's caller's SP
   226  	MOVL	CX, (m_morebuf+gobuf_sp)(BX)
   227  	MOVL	CX, m_moreargp(BX)	// the same address is recorded as the argument pointer
   228  	get_tls(CX)	// CX was used as scratch above; reload the TLS base
   229  	MOVL	g(CX), SI
   230  	MOVL	SI, (m_morebuf+gobuf_g)(BX)
   231  
   232  	// Set g->sched to context in f.
   233  	MOVL	0(SP), AX	// f's PC
   234  	MOVL	AX, (g_sched+gobuf_pc)(SI)
   235  	MOVL	SI, (g_sched+gobuf_g)(SI)
   236  	LEAL	4(SP), AX	// f's SP
   237  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   238  	MOVL	DX, (g_sched+gobuf_ctxt)(SI)
   239  
   240  	// Call newstack on m->g0's stack.
   241  	MOVL	m_g0(BX), BP
   242  	MOVL	BP, g(CX)
   243  	MOVL	(g_sched+gobuf_sp)(BP), AX
   244  	MOVL	-4(AX), BX	// fault if CALL would, before smashing SP
   245  	MOVL	AX, SP
   246  	CALL	runtime·newstack(SB)
   247  	MOVL	$0, 0x1003	// crash if newstack returns
   248  	RET
   249  
   250  // Called from panic.  Mimics morestack,
   251  // reuses stack growth code to create a frame
   252  // with the desired args running the desired function.
   253  //
   254  // func call(fn *byte, arg *byte, argsize uint32).
        //
        // The caller's PC/SP/g go into m->morebuf, this function's own entry
        // point and SP go into g->sched, and fn/arg/argsize are handed to
        // newstack through m->cret, m->moreargp and m->moreargsize.
   255  TEXT runtime·newstackcall(SB), NOSPLIT, $0-12
   256  	get_tls(CX)
   257  	MOVL	m(CX), BX
   258  
   259  	// Save our caller's state as the PC and SP to
   260  	// restore when returning from f.
   261  	MOVL	0(SP), AX	// our caller's PC
   262  	MOVL	AX, (m_morebuf+gobuf_pc)(BX)
   263  	LEAL	4(SP), AX	// our caller's SP
   264  	MOVL	AX, (m_morebuf+gobuf_sp)(BX)
   265  	MOVL	g(CX), AX
   266  	MOVL	AX, (m_morebuf+gobuf_g)(BX)
   267  
   268  	// Save our own state as the PC and SP to restore
   269  	// if this goroutine needs to be restarted.
   270  	MOVL	$runtime·newstackcall(SB), (g_sched+gobuf_pc)(AX)
   271  	MOVL	SP, (g_sched+gobuf_sp)(AX)
   272  
   273  	// Set up morestack arguments to call f on a new stack.
   274  	// We set f's frame size to 1, as a hint to newstack
   275  	// that this is a call from runtime·newstackcall.
   276  	// If it turns out that f needs a larger frame than
   277  	// the default stack, f's usual stack growth prolog will
   278  	// allocate a new segment (and recopy the arguments).
   279  	MOVL	4(SP), AX	// fn
   280  	MOVL	8(SP), DX	// arg frame
   281  	MOVL	12(SP), CX	// arg size
   282  
   283  	MOVL	AX, m_cret(BX)	// f's PC
   284  	MOVL	DX, m_moreargp(BX)	// f's argument pointer
   285  	MOVL	CX, m_moreargsize(BX)	// f's argument size
   286  	MOVL	$1, m_moreframesize(BX)	// f's frame size
   287  
   288  	// Call newstack on m->g0's stack.
   289  	MOVL	m_g0(BX), BP
   290  	get_tls(CX)	// CX held the arg size above; reload the TLS base
   291  	MOVL	BP, g(CX)
   292  	MOVL	(g_sched+gobuf_sp)(BP), SP
   293  	CALL	runtime·newstack(SB)
   294  	MOVL	$0, 0x1103	// crash if newstack returns
   295  	RET
   296  
   297  // reflect·call: call a function with the given argument list
   298  // func call(f *FuncVal, arg *byte, argsize uint32).
   299  // We don't have variable-sized frames, so we use a small number
   300  // of constant-sized-frame functions to encode a few bits of size in the pc.
   301  // Caution: ugly multiline assembly macros in your future!
        //
        // DISPATCH(NAME, MAXSIZE) expects argsize in CX. If CX <= MAXSIZE
        // (unsigned) it jumps to runtime·NAME; otherwise JA 3(PC) skips the
        // two-instruction jump and falls through to the next DISPATCH.
   302  
   303  #define DISPATCH(NAME,MAXSIZE)		\
   304  	CMPL	CX, $MAXSIZE;		\
   305  	JA	3(PC);			\
   306  	MOVL	$runtime·NAME(SB), AX;	\
   307  	JMP	AX
   308  // Note: can't just "JMP runtime·NAME(SB)" - bad inlining results.
   309  
   310  TEXT reflect·call(SB), NOSPLIT, $0-12
        	// Pick the smallest callN whose frame holds argsize bytes.
   311  	MOVL	argsize+8(FP), CX
   312  	DISPATCH(call16, 16)
   313  	DISPATCH(call32, 32)
   314  	DISPATCH(call64, 64)
   315  	DISPATCH(call128, 128)
   316  	DISPATCH(call256, 256)
   317  	DISPATCH(call512, 512)
   318  	DISPATCH(call1024, 1024)
   319  	DISPATCH(call2048, 2048)
   320  	DISPATCH(call4096, 4096)
   321  	DISPATCH(call8192, 8192)
   322  	DISPATCH(call16384, 16384)
   323  	DISPATCH(call32768, 32768)
   324  	DISPATCH(call65536, 65536)
   325  	DISPATCH(call131072, 131072)
   326  	DISPATCH(call262144, 262144)
   327  	DISPATCH(call524288, 524288)
   328  	DISPATCH(call1048576, 1048576)
   329  	DISPATCH(call2097152, 2097152)
   330  	DISPATCH(call4194304, 4194304)
   331  	DISPATCH(call8388608, 8388608)
   332  	DISPATCH(call16777216, 16777216)
   333  	DISPATCH(call33554432, 33554432)
   334  	DISPATCH(call67108864, 67108864)
   335  	DISPATCH(call134217728, 134217728)
   336  	DISPATCH(call268435456, 268435456)
   337  	DISPATCH(call536870912, 536870912)
   338  	DISPATCH(call1073741824, 1073741824)
   339  	MOVL	$runtime·badreflectcall(SB), AX	// argsize exceeds every bucket
   340  	JMP	AX
   341  
   342  #define CALLFN(NAME,MAXSIZE)			\
   343  TEXT runtime·NAME(SB), 0, $MAXSIZE-12;		\
   344  	/* copy arguments to stack */		\
   345  	MOVL	argptr+4(FP), SI;		\
   346  	MOVL	argsize+8(FP), CX;		\
   347  	MOVL	SP, DI;				\
   348  	REP;MOVSB;				\
   349  	/* call function */			\
   350  	MOVL	f+0(FP), DX;			\
   351  	CALL	(DX);				\
   352  	/* copy return values back */		\
   353  	MOVL	argptr+4(FP), DI;		\
   354  	MOVL	argsize+8(FP), CX;		\
   355  	MOVL	SP, SI;				\
   356  	REP;MOVSB;				\
   357  	RET
   358  
        // CALLFN(NAME, MAXSIZE) defines runtime·NAME with a MAXSIZE-byte frame
        // and no NOSPLIT flag. The body copies argsize bytes from argptr to the
        // bottom of the frame, calls through the code pointer stored at 0(f)
        // with f left in DX, then copies argsize bytes from the frame back to
        // argptr. One instance exists per size bucket used by reflect·call.
   359  CALLFN(call16, 16)
   360  CALLFN(call32, 32)
   361  CALLFN(call64, 64)
   362  CALLFN(call128, 128)
   363  CALLFN(call256, 256)
   364  CALLFN(call512, 512)
   365  CALLFN(call1024, 1024)
   366  CALLFN(call2048, 2048)
   367  CALLFN(call4096, 4096)
   368  CALLFN(call8192, 8192)
   369  CALLFN(call16384, 16384)
   370  CALLFN(call32768, 32768)
   371  CALLFN(call65536, 65536)
   372  CALLFN(call131072, 131072)
   373  CALLFN(call262144, 262144)
   374  CALLFN(call524288, 524288)
   375  CALLFN(call1048576, 1048576)
   376  CALLFN(call2097152, 2097152)
   377  CALLFN(call4194304, 4194304)
   378  CALLFN(call8388608, 8388608)
   379  CALLFN(call16777216, 16777216)
   380  CALLFN(call33554432, 33554432)
   381  CALLFN(call67108864, 67108864)
   382  CALLFN(call134217728, 134217728)
   383  CALLFN(call268435456, 268435456)
   384  CALLFN(call536870912, 536870912)
   385  CALLFN(call1073741824, 1073741824)
   386  
   387  // Return point when leaving stack.
   388  //
   389  // Lessstack can appear in stack traces for the same reason
   390  // as morestack; in that context, it has 0 arguments.
        //
        // Stores AX (the return value) in m->cret, switches to m->g0 and its
        // stack, and calls runtime·oldstack, which must not return.
   391  TEXT runtime·lessstack(SB), NOSPLIT, $0-0
   392  	// Save return value in m->cret
   393  	get_tls(CX)
   394  	MOVL	m(CX), BX
   395  	MOVL	AX, m_cret(BX)
   396  
   397  	// Call oldstack on m->g0's stack.
   398  	MOVL	m_g0(BX), BP
   399  	MOVL	BP, g(CX)
   400  	MOVL	(g_sched+gobuf_sp)(BP), SP
   401  	CALL	runtime·oldstack(SB)
   402  	MOVL	$0, 0x1004	// crash if oldstack returns
   403  	RET
   404  
   405  
   404  
   405  
   406  // bool cas(int32 *val, int32 old, int32 new)
   407  // Atomic 32-bit compare-and-swap:
   408  //	if(*val == old){
   409  //		*val = new;
   410  //		return 1;
   411  //	}else
   412  //		return 0;
   413  TEXT runtime·cas(SB), NOSPLIT, $0-12
   414  	MOVL	val+0(FP), BX	// BX = address of the word
   415  	MOVL	old+4(FP), AX	// AX = expected value (CMPXCHG compares against AX)
   416  	MOVL	new+8(FP), CX	// CX = replacement value
   417  	LOCK
   418  	CMPXCHGL	CX, 0(BX)
   419  	JNZ	cas_fail	// ZF clear: *val differed from old, nothing stored
   420  	MOVL	$1, AX
   421  	RET
        cas_fail:
   422  	MOVL	$0, AX
   423  	RET
   424  
   424  
   425  // bool runtime·cas64(uint64 *val, uint64 old, uint64 new)
   426  // Atomically:
   427  //	if(*val == old){
   428  //		*val = new;
   429  //		return 1;
   430  //	} else {
   431  //		return 0;
   432  //	}
        // old is passed by value: its low/high halves are loaded into AX/DX and
        // new's low/high halves into BX/CX, the register pairs CMPXCHG8B uses.
   433  TEXT runtime·cas64(SB), NOSPLIT, $0-20
   434  	MOVL	4(SP), BP	// val
   435  	MOVL	8(SP), AX	// old, low 32 bits
   436  	MOVL	12(SP), DX	// old, high 32 bits
   437  	MOVL	16(SP), BX	// new, low 32 bits
   438  	MOVL	20(SP), CX	// new, high 32 bits
   439  	LOCK
   440  	CMPXCHG8B	0(BP)
   441  	JNZ	cas64_fail	// ZF clear: comparison failed, nothing stored
   442  	MOVL	$1, AX
   443  	RET
   444  cas64_fail:
   445  	MOVL	$0, AX
   446  	RET
   447  
   448  // bool casp(void **p, void *old, void *new)
   449  // Atomic pointer-sized compare-and-swap:
   450  //	if(*p == old){
   451  //		*p = new;
   452  //		return 1;
   453  //	}else
   454  //		return 0;
   455  TEXT runtime·casp(SB), NOSPLIT, $0-12
   456  	MOVL	p+0(FP), BX	// BX = address of the pointer slot
   457  	MOVL	old+4(FP), AX	// AX = expected pointer (CMPXCHG compares against AX)
   458  	MOVL	new+8(FP), CX	// CX = replacement pointer
   459  	LOCK
   460  	CMPXCHGL	CX, 0(BX)
   461  	JNZ	casp_fail	// ZF clear: *p differed from old, nothing stored
   462  	MOVL	$1, AX
   463  	RET
        casp_fail:
   464  	MOVL	$0, AX
   465  	RET
   466  
   467  // uint32 xadd(uint32 volatile *val, int32 delta)
   468  // Atomically:
   469  //	*val += delta;
   470  //	return *val;
   471  TEXT runtime·xadd(SB), NOSPLIT, $0-8
   472  	MOVL	4(SP), BX	// val
   473  	MOVL	8(SP), AX	// delta
   474  	MOVL	AX, CX		// keep a copy: XADD overwrites AX with the old *val
   475  	LOCK
   476  	XADDL	AX, 0(BX)	// *val += delta; AX = previous *val
   477  	ADDL	CX, AX		// previous + delta = the value just stored
   478  	RET
   479  
   480  TEXT runtime·xchg(SB), NOSPLIT, $0-8
        	// Swap the second argument into the 32-bit word addressed by the
        	// first argument; the word's previous value is left in AX.
        	// XCHG with a memory operand is locked by the processor, so no
        	// explicit LOCK prefix is used.
   481  	MOVL	4(SP), BX	// address of the word
   482  	MOVL	8(SP), AX	// new value
   483  	XCHGL	AX, 0(BX)
   484  	RET
   485  
   486  TEXT runtime·procyield(SB),NOSPLIT,$0-4
        	// Spin-wait hint: execute PAUSE once per unit of the 32-bit count
        	// passed as the only argument. The declared argument size is 4 bytes
        	// to match the word read from 4(SP).
        	// The count is decremented before it is tested, so a count of 0
        	// wraps around and spins 2^32 times; callers must pass a non-zero
        	// count.
   487  	MOVL	4(SP), AX	// iteration count
   488  again:
   489  	PAUSE
   490  	SUBL	$1, AX
   491  	JNZ	again
   492  	RET
   493  
   494  TEXT runtime·atomicstorep(SB), NOSPLIT, $0-8
        	// Atomically store the second argument into the pointer-sized slot
        	// addressed by the first argument. XCHG with a memory operand is
        	// locked by the processor; the old value that lands in AX is not
        	// part of the contract as far as this file shows.
   495  	MOVL	4(SP), BX	// destination address
   496  	MOVL	8(SP), AX	// value to store
   497  	XCHGL	AX, 0(BX)
   498  	RET
   499  
   500  TEXT runtime·atomicstore(SB), NOSPLIT, $0-8
        	// Atomically store the 32-bit second argument into the word
        	// addressed by the first argument, using an implicitly locked XCHG.
   501  	MOVL	4(SP), BX	// destination address
   502  	MOVL	8(SP), AX	// value to store
   503  	XCHGL	AX, 0(BX)
   504  	RET
   505  
   506  // uint64 atomicload64(uint64 volatile* addr);
   507  // so actually
   508  // void atomicload64(uint64 *res, uint64 volatile *addr);
        // Copies the 8 bytes at addr to res with a single 64-bit MMX load and a
        // single 64-bit MMX store. The MMX instructions are emitted as raw bytes.
   509  TEXT runtime·atomicload64(SB), NOSPLIT, $0-8
   510  	MOVL	4(SP), BX	// res
   511  	MOVL	8(SP), AX	// addr
   512  	// MOVQ (%EAX), %MM0
   513  	BYTE $0x0f; BYTE $0x6f; BYTE $0x00
   514  	// MOVQ %MM0, 0(%EBX)
   515  	BYTE $0x0f; BYTE $0x7f; BYTE $0x03
   516  	// EMMS (leave MMX state so the x87 FPU is usable again)
   517  	BYTE $0x0F; BYTE $0x77
   518  	RET
   519  
   520  // void runtime·atomicstore64(uint64 volatile* addr, uint64 v);
        // Writes v to *addr with one 64-bit MMX store, then executes a locked
        // read-modify-write on the stack as a memory fence.
   521  TEXT runtime·atomicstore64(SB), NOSPLIT, $0-12
   522  	MOVL	4(SP), AX	// addr
   523  	// MOVQ and EMMS were introduced on the Pentium MMX.
   524  	// MOVQ 0x8(%ESP), %MM0	(v occupies 8(SP)..15(SP))
   525  	BYTE $0x0f; BYTE $0x6f; BYTE $0x44; BYTE $0x24; BYTE $0x08
   526  	// MOVQ %MM0, (%EAX)
   527  	BYTE $0x0f; BYTE $0x7f; BYTE $0x00 
   528  	// EMMS
   529  	BYTE $0x0F; BYTE $0x77
   530  	// This is essentially a no-op, but it provides required memory fencing.
   531  	// It can be replaced with MFENCE, but MFENCE was introduced only on the Pentium4 (SSE2).
   532  	MOVL	$0, AX
   533  	LOCK
   534  	XADDL	AX, (SP)	// adds 0 to the return-address slot: value unchanged
   535  	RET
   536  
   537  // void jmpdefer(fn, sp);
   538  // called from deferreturn.
   539  // 1. pop the caller
   540  // 2. sub 5 bytes from the caller's return
   541  // 3. jmp to the argument
        // The 5-byte adjustment presumably matches the length of the CALL
        // instruction that invoked deferreturn, so that the deferred function
        // returns to that CALL and it runs again — confirm against the
        // compiler's output.
   542  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
   543  	MOVL	4(SP), DX	// fn
   544  	MOVL	8(SP), BX	// caller sp
   545  	LEAL	-4(BX), SP	// caller sp after CALL
   546  	SUBL	$5, (SP)	// return to CALL again
   547  	MOVL	0(DX), BX	// code pointer stored in the first word of fn
   548  	JMP	BX	// but first run the deferred function
   549  
   550  // Save state of caller into g->sched.
        // Stores the caller's SP and return PC in g->sched and zeroes
        // sched.ret and sched.ctxt. AX and BX are pushed on entry and popped
        // before returning, so they are preserved for the caller.
   551  TEXT gosave<>(SB),NOSPLIT,$0
   552  	PUSHL	AX
   553  	PUSHL	BX
   554  	get_tls(BX)
   555  	MOVL	g(BX), BX
   556  	LEAL	arg+0(FP), AX	// address just above our return address = caller's SP
   557  	MOVL	AX, (g_sched+gobuf_sp)(BX)
   558  	MOVL	-4(AX), AX	// our return address = caller's PC
   559  	MOVL	AX, (g_sched+gobuf_pc)(BX)
   560  	MOVL	$0, (g_sched+gobuf_ret)(BX)
   561  	MOVL	$0, (g_sched+gobuf_ctxt)(BX)
   562  	POPL	BX
   563  	POPL	AX
   564  	RET
   565  
   566  // asmcgocall(void(*fn)(void*), void *arg)
   567  // Call fn(arg) on the scheduler stack,
   568  // aligned appropriately for the gcc ABI.
   569  // See cgocall.c for more details.
        // The original g and SP are kept in the new frame across the call and
        // restored afterwards.
   570  TEXT runtime·asmcgocall(SB),NOSPLIT,$0-8
   571  	MOVL	fn+0(FP), AX
   572  	MOVL	arg+4(FP), BX
   573  	MOVL	SP, DX	// remember the incoming SP
   574  
   575  	// Figure out if we need to switch to m->g0 stack.
   576  	// We get called to create new OS threads too, and those
   577  	// come in on the m->g0 stack already.
   578  	get_tls(CX)
   579  	MOVL	m(CX), BP
   580  	MOVL	m_g0(BP), SI
   581  	MOVL	g(CX), DI
   582  	CMPL	SI, DI
   583  	JEQ	4(PC)	// already on g0: skip the save and the stack switch
   584  	CALL	gosave<>(SB)
   585  	MOVL	SI, g(CX)
   586  	MOVL	(g_sched+gobuf_sp)(SI), SP
   587  
   588  	// Now on a scheduling stack (a pthread-created stack).
   589  	SUBL	$32, SP
   590  	ANDL	$~15, SP	// alignment, perhaps unnecessary
   591  	MOVL	DI, 8(SP)	// save g
   592  	MOVL	DX, 4(SP)	// save SP
   593  	MOVL	BX, 0(SP)	// first argument in x86-32 ABI
   594  	CALL	AX
   595  
   596  	// Restore registers, g, stack pointer.
   597  	get_tls(CX)
   598  	MOVL	8(SP), DI
   599  	MOVL	DI, g(CX)
   600  	MOVL	4(SP), SP
   601  	RET
   602  
   603  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
   604  // Turn the fn into a Go func (by taking its address) and call
   605  // cgocallback_gofunc.
        // The 12-byte local frame holds the three outgoing arguments.
   606  TEXT runtime·cgocallback(SB),NOSPLIT,$12-12
   607  	LEAL	fn+0(FP), AX	// address of the slot holding fn, not fn itself
   608  	MOVL	AX, 0(SP)
   609  	MOVL	frame+4(FP), AX
   610  	MOVL	AX, 4(SP)
   611  	MOVL	framesize+8(FP), AX
   612  	MOVL	AX, 8(SP)
   613  	MOVL	$runtime·cgocallback_gofunc(SB), AX
   614  	CALL	AX
   615  	RET
   616  
   617  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
   618  // See cgocall.c for more details.
        // Obtains an m if the thread has none, switches from m->g0 to m->curg,
        // calls runtime·cgocallbackg there, then undoes both switches and
        // releases a borrowed m.
   619  TEXT runtime·cgocallback_gofunc(SB),NOSPLIT,$12-12
   620  	// If m is nil, Go did not create the current thread.
   621  	// Call needm to obtain one for temporary use.
   622  	// In this case, we're running on the thread stack, so there's
   623  	// lots of space, but the linker doesn't know. Hide the call from
   624  	// the linker analysis by using an indirect call through AX.
   625  	get_tls(CX)
   626  #ifdef GOOS_windows
        	// On Windows a zero TLS base is treated as "no m": BP stays 0 and
        	// the m(CX) load below is skipped.
   627  	MOVL	$0, BP
   628  	CMPL	CX, $0
   629  	JEQ	2(PC)
   630  #endif
   631  	MOVL	m(CX), BP
   632  	MOVL	BP, DX // saved copy of oldm
   633  	CMPL	BP, $0
   634  	JNE	havem
   635  needm:
   636  	MOVL	DX, 0(SP)	// keep oldm across the call
   637  	MOVL	$runtime·needm(SB), AX
   638  	CALL	AX
   639  	MOVL	0(SP), DX
   640  	get_tls(CX)
   641  	MOVL	m(CX), BP
   642  
   643  havem:
   644  	// Now there's a valid m, and we're running on its m->g0.
   645  	// Save current m->g0->sched.sp on stack and then set it to SP.
   646  	// Save current sp in m->g0->sched.sp in preparation for
   647  	// switch back to m->curg stack.
   648  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   649  	// On Windows, the SEH is at 4(SP) and 8(SP).
   650  	MOVL	m_g0(BP), SI
   651  	MOVL	(g_sched+gobuf_sp)(SI), AX
   652  	MOVL	AX, 0(SP)
   653  	MOVL	SP, (g_sched+gobuf_sp)(SI)
   654  
   655  	// Switch to m->curg stack and call runtime.cgocallbackg.
   656  	// Because we are taking over the execution of m->curg
   657  	// but *not* resuming what had been running, we need to
   658  	// save that information (m->curg->sched) so we can restore it.
   659  	// We can restore m->curg->sched.sp easily, because calling
   660  	// runtime.cgocallbackg leaves SP unchanged upon return.
   661  	// To save m->curg->sched.pc, we push it onto the stack.
   662  	// This has the added benefit that it looks to the traceback
   663  	// routine like cgocallbackg is going to return to that
   664  	// PC (because the frame we allocate below has the same
   665  	// size as cgocallback_gofunc's frame declared above)
   666  	// so that the traceback will seamlessly trace back into
   667  	// the earlier calls.
   668  	//
   669  	// In the new goroutine, 0(SP) holds the saved oldm (DX) register.
   670  	// 4(SP) and 8(SP) are unused.
   671  	MOVL	m_curg(BP), SI
   672  	MOVL	SI, g(CX)
   673  	MOVL	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
   674  	MOVL	(g_sched+gobuf_pc)(SI), BP
   675  	MOVL	BP, -4(DI)	// saved sched.pc sits where a return address would be
   676  	LEAL	-(4+12)(DI), SP	// 4 bytes for the saved pc + a 12-byte frame
   677  	MOVL	DX, 0(SP)
   678  	CALL	runtime·cgocallbackg(SB)
   679  	MOVL	0(SP), DX
   680  
   681  	// Restore g->sched (== m->curg->sched) from saved values.
   682  	get_tls(CX)
   683  	MOVL	g(CX), SI
   684  	MOVL	12(SP), BP	// the sched.pc stored at -4(DI) above
   685  	MOVL	BP, (g_sched+gobuf_pc)(SI)
   686  	LEAL	(12+4)(SP), DI	// the original sched.sp
   687  	MOVL	DI, (g_sched+gobuf_sp)(SI)
   688  
   689  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   690  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   691  	// so we do not have to restore it.)
   692  	MOVL	m(CX), BP
   693  	MOVL	m_g0(BP), SI
   694  	MOVL	SI, g(CX)
   695  	MOVL	(g_sched+gobuf_sp)(SI), SP
   696  	MOVL	0(SP), AX
   697  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   698  	
   699  	// If the m on entry was nil, we called needm above to borrow an m
   700  	// for the duration of the call. Since the call is over, return it with dropm.
   701  	CMPL	DX, $0
   702  	JNE 3(PC)	// had an m on entry: skip the dropm call
   703  	MOVL	$runtime·dropm(SB), AX
   704  	CALL	AX
   705  
   706  	// Done!
   707  	RET
   708  
   709  // void setmg(M*, G*); set m and g. for use by needm.
        // Stores the two arguments into the thread-local m and g slots.
        // On Windows the TLS base at 0x14(FS) is first pointed at m->tls, or
        // cleared when m is nil, in which case the function returns without
        // touching the m and g slots.
   710  TEXT runtime·setmg(SB), NOSPLIT, $0-8
   711  #ifdef GOOS_windows
   712  	MOVL	mm+0(FP), AX
   713  	CMPL	AX, $0
   714  	JNE	settls
   715  	MOVL	$0, 0x14(FS)
   716  	RET
   717  settls:
   718  	LEAL	m_tls(AX), AX
   719  	MOVL	AX, 0x14(FS)
   720  #endif
   722  	get_tls(CX)
   723  	MOVL	mm+0(FP), AX	// loaded after get_tls; AX may hold &m->tls from the Windows path
   724  	MOVL	AX, m(CX)
   725  	MOVL	gg+4(FP), BX
   726  	MOVL	BX, g(CX)
   727  	RET
   728  
   729  // void setmg_gcc(M*, G*); set m and g. for use by gcc
        // _rt0_go passes this function's address to _cgo_init.
        // Only AX and DX are written.
   730  TEXT setmg_gcc<>(SB), NOSPLIT, $0
   731  	get_tls(AX)
   732  	MOVL	mm+0(FP), DX
   733  	MOVL	DX, m(AX)
   734  	MOVL	gg+4(FP), DX
   735  	MOVL	DX,g (AX)
   736  	RET
   737  
   738  // check that SP is strictly between g->stackguard and g->stackbase
        // (unsigned: g->stackguard < SP < g->stackbase); trap with INT 3 otherwise.
   739  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   740  	get_tls(CX)
   741  	MOVL	g(CX), AX
   742  	CMPL	g_stackbase(AX), SP
   743  	JHI	2(PC)	// stackbase > SP: ok, skip the trap
   744  	INT	$3
   745  	CMPL	SP, g_stackguard(AX)
   746  	JHI	2(PC)	// SP > stackguard: ok, skip the trap
   747  	INT	$3
   748  	RET
   749  
   750  TEXT runtime·memclr(SB),NOSPLIT,$0-8
        	// Zero count bytes starting at addr: count/4 dword stores followed
        	// by count%4 byte stores. Uses AX, BX, CX and DI.
   751  	MOVL	4(SP), DI		// arg 1 addr
   752  	MOVL	8(SP), CX		// arg 2 count
   753  	MOVL	CX, BX
   754  	ANDL	$3, BX			// BX = trailing bytes (count % 4)
   755  	SHRL	$2, CX			// CX = whole dwords (count / 4)
   756  	MOVL	$0, AX
   757  	CLD				// string stores move upward
   758  	REP
   759  	STOSL
   760  	MOVL	BX, CX
   761  	REP
   762  	STOSB
   763  	RET
   764  
   765  TEXT runtime·getcallerpc(SB),NOSPLIT,$0-4
        	// Given the address of the caller's first argument, return the word
        	// stored 4 bytes below it, i.e. the caller's return address.
   766  	MOVL	x+0(FP),AX		// addr of first arg
   767  	MOVL	-4(AX),AX		// get calling pc
   768  	RET
   769  
   770  TEXT runtime·setcallerpc(SB),NOSPLIT,$0-8
        	// Given the address of the caller's first argument and a new pc,
        	// overwrite the word 4 bytes below that address (the caller's
        	// return address) with the new pc.
   771  	MOVL	x+0(FP),AX		// addr of first arg
   772  	MOVL	x+4(FP), BX		// second argument: the new pc
   773  	MOVL	BX, -4(AX)		// set calling pc
   774  	RET
   775  
   776  TEXT runtime·getcallersp(SB), NOSPLIT, $0-4
        	// Return the argument unchanged. Presumably callers pass the address
        	// of their own first argument, as for getcallerpc — confirm at call sites.
   777  	MOVL	sp+0(FP), AX
   778  	RET
   779  
   780  // int64 runtime·cputicks(void), so really
   781  // void runtime·cputicks(int64 *ticks)
        // Stores the 64-bit time-stamp counter read by RDTSC into *ticks.
   782  TEXT runtime·cputicks(SB),NOSPLIT,$0-4
   783  	RDTSC				// DX:AX = time-stamp counter
   784  	MOVL	ret+0(FP), DI
   785  	MOVL	AX, 0(DI)		// low 32 bits
   786  	MOVL	DX, 4(DI)		// high 32 bits
   787  	RET
   788  
   789  TEXT runtime·ldt0setup(SB),NOSPLIT,$16-0
        	// Call runtime·setldt(7, &runtime·tls0, 32), passing the three
        	// arguments in this function's 16-byte frame.
   790  	// set up ldt 7 to point at tls0
   791  	// ldt 1 would be fine on Linux, but on OS X, 7 is as low as we can go.
   792  	// the entry number is just a hint.  setldt will set up GS with what it used.
   793  	MOVL	$7, 0(SP)
   794  	LEAL	runtime·tls0(SB), AX
   795  	MOVL	AX, 4(SP)
   796  	MOVL	$32, 8(SP)	// sizeof(tls array)
   797  	CALL	runtime·setldt(SB)
   798  	RET
   799  
   800  TEXT runtime·emptyfunc(SB),0,$0-0
        	// Does nothing. Declared with flags 0 (no NOSPLIT); _rt0_go calls it
        	// to "fault if stack check is wrong".
   801  	RET
   802  
   803  TEXT runtime·abort(SB),NOSPLIT,$0-0
        	// Stop the program with a breakpoint trap. If the trap is resumed
        	// (for example under a debugger), spin forever rather than fall
        	// through into whatever code the linker places after this function.
   804  	INT $0x3
        abort_spin:
        	JMP	abort_spin
   805  
   806  TEXT runtime·stackguard(SB),NOSPLIT,$0-8
        	// Writes two words into the 8-byte argument/result area: the current
        	// SP at offset 0 and the current g's stackguard at offset 4.
   807  	MOVL	SP, DX
   808  	MOVL	DX, sp+0(FP)
   809  	get_tls(CX)
   810  	MOVL	g(CX), BX
   811  	MOVL	g_stackguard(BX), DX
   812  	MOVL	DX, limit+4(FP)
   813  	RET
   814  
   815  GLOBL runtime·tls0(SB), $32	// 32-byte TLS area; ldt0setup passes its address and size to setldt
   816  
   817  // hash function using AES hardware instructions
        // Loads the three stack arguments into the registers aeshashbody
        // expects (DX = hash pointer, CX = size, AX = data pointer) and
        // tail-jumps to it.
   818  TEXT runtime·aeshash(SB),NOSPLIT,$0-12
   819  	MOVL	4(SP), DX	// ptr to hash value
   820  	MOVL	8(SP), CX	// size
   821  	MOVL	12(SP), AX	// ptr to data
   822  	JMP	runtime·aeshashbody(SB)
   823  
   824  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
        	// String variant of aeshash: the third argument points at a string
        	// header, from which the data pointer (offset 0) and length
        	// (offset 4) are read. The second argument is not used.
   825  	MOVL	4(SP), DX	// ptr to hash value
   826  	MOVL	12(SP), AX	// ptr to string struct
   827  	MOVL	4(AX), CX	// length of string
   828  	MOVL	(AX), AX	// string data
   829  	JMP	runtime·aeshashbody(SB)
   830  
   831  // AX: data
   832  // CX: length
   833  // DX: ptr to seed input / hash output
        // Mixes the seed, the length and the data with AESENC rounds keyed from
        // runtime·aeskeysched, and writes the low 32 bits of the result to (DX).
   834  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-12
   835  	MOVL	(DX), X0	// seed to low 32 bits of xmm0
   836  	PINSRD	$1, CX, X0	// size to next 32 bits of xmm0
   837  	MOVO	runtime·aeskeysched+0(SB), X2
   838  	MOVO	runtime·aeskeysched+16(SB), X3
   839  	CMPL	CX, $16
   840  	JB	aessmall
   841  aesloop:
        	// consume full 16-byte blocks while more than 16 bytes remain
   842  	CMPL	CX, $16
   843  	JBE	aesloopend
   844  	MOVOU	(AX), X1
   845  	AESENC	X2, X0
   846  	AESENC	X1, X0
   847  	SUBL	$16, CX
   848  	ADDL	$16, AX
   849  	JMP	aesloop
   850  // 1-16 bytes remaining
   851  aesloopend:
   852  	// This load may overlap with the previous load above.
   853  	// We'll hash some bytes twice, but that's ok.
   854  	MOVOU	-16(AX)(CX*1), X1	// the last 16 bytes of the input
   855  	JMP	partial
   856  // 0-15 bytes
   857  aessmall:
   858  	TESTL	CX, CX
   859  	JE	finalize	// 0 bytes
   860  
   861  	CMPB	AX, $0xf0	// compares only the low byte of the address
   862  	JA	highpartial
   863  
   864  	// 16 bytes loaded at this address won't cross
   865  	// a page boundary, so we can load it directly.
   866  	MOVOU	(AX), X1
   867  	ADDL	CX, CX		// CX*2, scaled by 8 below = 16 bytes per table entry
   868  	PAND	masks<>(SB)(CX*8), X1	// keep only the low `length` bytes
   869  	JMP	partial
   870  highpartial:
   871  	// address ends in 1111xxxx.  Might be up against
   872  	// a page boundary, so load ending at last byte.
   873  	// Then shift bytes down using pshufb.
   874  	MOVOU	-16(AX)(CX*1), X1
   875  	ADDL	CX, CX		// same 16-byte table stride as above
   876  	PSHUFB	shifts<>(SB)(CX*8), X1
   877  partial:
   878  	// incorporate partial block into hash
   879  	AESENC	X3, X0
   880  	AESENC	X1, X0
   881  finalize:	
   882  	// finalize hash
   883  	AESENC	X2, X0
   884  	AESENC	X3, X0
   885  	AESENC	X2, X0
   886  	MOVL	X0, (DX)	// store the low 32 bits as the hash
   887  	RET
   888  
   889  TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
        	// Hash one 32-bit value: the seed goes in dword 0 of X0 and the data
        	// in dword 1, followed by three AESENC rounds; the low 32 bits of
        	// the result are written back through the hash pointer.
        	// The second argument (size) is not read.
   890  	MOVL	4(SP), DX	// ptr to hash value
   891  	MOVL	12(SP), AX	// ptr to data
   892  	MOVL	(DX), X0	// seed
   893  	PINSRD	$1, (AX), X0	// data
   894  	AESENC	runtime·aeskeysched+0(SB), X0
   895  	AESENC	runtime·aeskeysched+16(SB), X0
   896  	AESENC	runtime·aeskeysched+0(SB), X0
   897  	MOVL	X0, (DX)
   898  	RET
   899  
   900  TEXT runtime·aeshash64(SB),NOSPLIT,$0-12
        	// Hash one 64-bit value: the data fills the low 8 bytes of X0 and the
        	// seed goes in dword 2, followed by three AESENC rounds; the low
        	// 32 bits of the result are written back through the hash pointer.
        	// The second argument (size) is not read.
   901  	MOVL	4(SP), DX	// ptr to hash value
   902  	MOVL	12(SP), AX	// ptr to data
   903  	MOVQ	(AX), X0	// data
   904  	PINSRD	$2, (DX), X0	// seed
   905  	AESENC	runtime·aeskeysched+0(SB), X0
   906  	AESENC	runtime·aeskeysched+16(SB), X0
   907  	AESENC	runtime·aeskeysched+0(SB), X0
   908  	MOVL	X0, (DX)
   909  	RET
   910  
   911  // Simple masks to get rid of data in the high part of the register.
        // Sixteen 16-byte entries: entry n (at offset n*16, n = 0..15) has its
        // low n bytes set to 0xff and the remaining bytes zero. aeshashbody
        // indexes the table by length and ANDs the entry with the loaded data.
   912  DATA masks<>+0x00(SB)/4, $0x00000000
   913  DATA masks<>+0x04(SB)/4, $0x00000000
   914  DATA masks<>+0x08(SB)/4, $0x00000000
   915  DATA masks<>+0x0c(SB)/4, $0x00000000
   916  	
   917  DATA masks<>+0x10(SB)/4, $0x000000ff
   918  DATA masks<>+0x14(SB)/4, $0x00000000
   919  DATA masks<>+0x18(SB)/4, $0x00000000
   920  DATA masks<>+0x1c(SB)/4, $0x00000000
   921  	
   922  DATA masks<>+0x20(SB)/4, $0x0000ffff
   923  DATA masks<>+0x24(SB)/4, $0x00000000
   924  DATA masks<>+0x28(SB)/4, $0x00000000
   925  DATA masks<>+0x2c(SB)/4, $0x00000000
   926  	
   927  DATA masks<>+0x30(SB)/4, $0x00ffffff
   928  DATA masks<>+0x34(SB)/4, $0x00000000
   929  DATA masks<>+0x38(SB)/4, $0x00000000
   930  DATA masks<>+0x3c(SB)/4, $0x00000000
   931  	
   932  DATA masks<>+0x40(SB)/4, $0xffffffff
   933  DATA masks<>+0x44(SB)/4, $0x00000000
   934  DATA masks<>+0x48(SB)/4, $0x00000000
   935  DATA masks<>+0x4c(SB)/4, $0x00000000
   936  	
   937  DATA masks<>+0x50(SB)/4, $0xffffffff
   938  DATA masks<>+0x54(SB)/4, $0x000000ff
   939  DATA masks<>+0x58(SB)/4, $0x00000000
   940  DATA masks<>+0x5c(SB)/4, $0x00000000
   941  	
   942  DATA masks<>+0x60(SB)/4, $0xffffffff
   943  DATA masks<>+0x64(SB)/4, $0x0000ffff
   944  DATA masks<>+0x68(SB)/4, $0x00000000
   945  DATA masks<>+0x6c(SB)/4, $0x00000000
   946  	
   947  DATA masks<>+0x70(SB)/4, $0xffffffff
   948  DATA masks<>+0x74(SB)/4, $0x00ffffff
   949  DATA masks<>+0x78(SB)/4, $0x00000000
   950  DATA masks<>+0x7c(SB)/4, $0x00000000
   951  	
   952  DATA masks<>+0x80(SB)/4, $0xffffffff
   953  DATA masks<>+0x84(SB)/4, $0xffffffff
   954  DATA masks<>+0x88(SB)/4, $0x00000000
   955  DATA masks<>+0x8c(SB)/4, $0x00000000
   956  	
   957  DATA masks<>+0x90(SB)/4, $0xffffffff
   958  DATA masks<>+0x94(SB)/4, $0xffffffff
   959  DATA masks<>+0x98(SB)/4, $0x000000ff
   960  DATA masks<>+0x9c(SB)/4, $0x00000000
   961  	
   962  DATA masks<>+0xa0(SB)/4, $0xffffffff
   963  DATA masks<>+0xa4(SB)/4, $0xffffffff
   964  DATA masks<>+0xa8(SB)/4, $0x0000ffff
   965  DATA masks<>+0xac(SB)/4, $0x00000000
   966  	
   967  DATA masks<>+0xb0(SB)/4, $0xffffffff
   968  DATA masks<>+0xb4(SB)/4, $0xffffffff
   969  DATA masks<>+0xb8(SB)/4, $0x00ffffff
   970  DATA masks<>+0xbc(SB)/4, $0x00000000
   971  	
   972  DATA masks<>+0xc0(SB)/4, $0xffffffff
   973  DATA masks<>+0xc4(SB)/4, $0xffffffff
   974  DATA masks<>+0xc8(SB)/4, $0xffffffff
   975  DATA masks<>+0xcc(SB)/4, $0x00000000
   976  	
   977  DATA masks<>+0xd0(SB)/4, $0xffffffff
   978  DATA masks<>+0xd4(SB)/4, $0xffffffff
   979  DATA masks<>+0xd8(SB)/4, $0xffffffff
   980  DATA masks<>+0xdc(SB)/4, $0x000000ff
   981  	
   982  DATA masks<>+0xe0(SB)/4, $0xffffffff
   983  DATA masks<>+0xe4(SB)/4, $0xffffffff
   984  DATA masks<>+0xe8(SB)/4, $0xffffffff
   985  DATA masks<>+0xec(SB)/4, $0x0000ffff
   986  	
   987  DATA masks<>+0xf0(SB)/4, $0xffffffff
   988  DATA masks<>+0xf4(SB)/4, $0xffffffff
   989  DATA masks<>+0xf8(SB)/4, $0xffffffff
   990  DATA masks<>+0xfc(SB)/4, $0x00ffffff
   991  
   992  GLOBL masks<>(SB),RODATA,$256
   993  
   994  // shifts<> holds sixteen 16-byte control masks for pshufb (256 bytes in all).
   995  // Entry n, at offset n*16, moves the top n bytes of the register down to
   996  // bytes 0..n-1; the 0xff selectors clear every other destination byte.
   997  DATA shifts<>+0x00(SB)/4, $0x00000000	// entry 0: all-zero selectors
   998  DATA shifts<>+0x04(SB)/4, $0x00000000
   999  DATA shifts<>+0x08(SB)/4, $0x00000000
  1000  DATA shifts<>+0x0c(SB)/4, $0x00000000
  1001  	
  1002  DATA shifts<>+0x10(SB)/4, $0xffffff0f	// entry 1: source byte 15 -> byte 0
  1003  DATA shifts<>+0x14(SB)/4, $0xffffffff
  1004  DATA shifts<>+0x18(SB)/4, $0xffffffff
  1005  DATA shifts<>+0x1c(SB)/4, $0xffffffff
  1006  	
  1007  DATA shifts<>+0x20(SB)/4, $0xffff0f0e	// entry 2: source bytes 14-15 -> bytes 0-1
  1008  DATA shifts<>+0x24(SB)/4, $0xffffffff
  1009  DATA shifts<>+0x28(SB)/4, $0xffffffff
  1010  DATA shifts<>+0x2c(SB)/4, $0xffffffff
  1011  	
  1012  DATA shifts<>+0x30(SB)/4, $0xff0f0e0d	// entry 3
  1013  DATA shifts<>+0x34(SB)/4, $0xffffffff
  1014  DATA shifts<>+0x38(SB)/4, $0xffffffff
  1015  DATA shifts<>+0x3c(SB)/4, $0xffffffff
  1016  	
  1017  DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c	// entry 4
  1018  DATA shifts<>+0x44(SB)/4, $0xffffffff
  1019  DATA shifts<>+0x48(SB)/4, $0xffffffff
  1020  DATA shifts<>+0x4c(SB)/4, $0xffffffff
  1021  	
  1022  DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b	// entry 5
  1023  DATA shifts<>+0x54(SB)/4, $0xffffff0f
  1024  DATA shifts<>+0x58(SB)/4, $0xffffffff
  1025  DATA shifts<>+0x5c(SB)/4, $0xffffffff
  1026  	
  1027  DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a	// entry 6
  1028  DATA shifts<>+0x64(SB)/4, $0xffff0f0e
  1029  DATA shifts<>+0x68(SB)/4, $0xffffffff
  1030  DATA shifts<>+0x6c(SB)/4, $0xffffffff
  1031  	
  1032  DATA shifts<>+0x70(SB)/4, $0x0c0b0a09	// entry 7
  1033  DATA shifts<>+0x74(SB)/4, $0xff0f0e0d
  1034  DATA shifts<>+0x78(SB)/4, $0xffffffff
  1035  DATA shifts<>+0x7c(SB)/4, $0xffffffff
  1036  	
  1037  DATA shifts<>+0x80(SB)/4, $0x0b0a0908	// entry 8
  1038  DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c
  1039  DATA shifts<>+0x88(SB)/4, $0xffffffff
  1040  DATA shifts<>+0x8c(SB)/4, $0xffffffff
  1041  	
  1042  DATA shifts<>+0x90(SB)/4, $0x0a090807	// entry 9
  1043  DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b
  1044  DATA shifts<>+0x98(SB)/4, $0xffffff0f
  1045  DATA shifts<>+0x9c(SB)/4, $0xffffffff
  1046  	
  1047  DATA shifts<>+0xa0(SB)/4, $0x09080706	// entry 10
  1048  DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a
  1049  DATA shifts<>+0xa8(SB)/4, $0xffff0f0e
  1050  DATA shifts<>+0xac(SB)/4, $0xffffffff
  1051  	
  1052  DATA shifts<>+0xb0(SB)/4, $0x08070605	// entry 11
  1053  DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09
  1054  DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d
  1055  DATA shifts<>+0xbc(SB)/4, $0xffffffff
  1056  	
  1057  DATA shifts<>+0xc0(SB)/4, $0x07060504	// entry 12
  1058  DATA shifts<>+0xc4(SB)/4, $0x0b0a0908
  1059  DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c
  1060  DATA shifts<>+0xcc(SB)/4, $0xffffffff
  1061  	
  1062  DATA shifts<>+0xd0(SB)/4, $0x06050403	// entry 13
  1063  DATA shifts<>+0xd4(SB)/4, $0x0a090807
  1064  DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b
  1065  DATA shifts<>+0xdc(SB)/4, $0xffffff0f
  1066  	
  1067  DATA shifts<>+0xe0(SB)/4, $0x05040302	// entry 14
  1068  DATA shifts<>+0xe4(SB)/4, $0x09080706
  1069  DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a
  1070  DATA shifts<>+0xec(SB)/4, $0xffff0f0e
  1071  	
  1072  DATA shifts<>+0xf0(SB)/4, $0x04030201	// entry 15: source bytes 1-15 -> bytes 0-14
  1073  DATA shifts<>+0xf4(SB)/4, $0x08070605
  1074  DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09
  1075  DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d
  1076  
  1077  GLOBL shifts<>(SB),RODATA,$256
  1078  
  1079  TEXT runtime·memeq(SB),NOSPLIT,$0-12	// load a, b, count and hand off to memeqbody
  1080  	MOVL	count+8(FP), BX	// BX = number of bytes to compare
  1081  	MOVL	b+4(FP), DI	// DI = second range
  1082  	MOVL	a+0(FP), SI	// SI = first range
  1083  	JMP	runtime·memeqbody(SB)	// jump, not call: memeqbody's RET returns to our caller with its AX
  1084  
  1085  TEXT bytes·Equal(SB),NOSPLIT,$0-25	// ret = 1 when a and b have equal length and equal bytes, else 0
  1086  	XORL	AX, AX	// result defaults to 0; must precede CMPL because XORL rewrites the flags
  1087  	MOVL	b_len+16(FP), CX
  1088  	MOVL	a_len+4(FP), BX	// BX doubles as the count passed to memeqbody
  1089  	CMPL	CX, BX
  1090  	JNE	eqret	// lengths differ: store the 0 already in AX
  1091  	MOVL	b+12(FP), DI
  1092  	MOVL	a+0(FP), SI
  1093  	CALL	runtime·memeqbody(SB)	// takes SI, DI, BX; answers in AX
  1094  eqret:
  1095  	MOVB	AX, ret+24(FP)
  1096  	RET
  1097  
  1098  // memeqbody reports whether the count bytes at a equal the count bytes at b.
  1099  // in:  SI = a, DI = b, BX = count (all three may be overwritten)
  1100  // out: AX = 1 if equal, 0 if not; CX, DX and X0-X7 serve as scratch
  1101  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1102  	XORL	AX, AX	// AX = 0, so an early RET reports "not equal"
  1103  
  1104  	CMPL	BX, $4
  1105  	JB	small	// fewer than 4 bytes: handled by one masked 32-bit compare
  1106  
  1107  	// 64 bytes at a time using xmm registers
  1108  hugeloop:
  1109  	CMPL	BX, $64
  1110  	JB	bigloop
  1111  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
  1112  	JE	bigloop
  1113  	MOVOU	(SI), X0
  1114  	MOVOU	(DI), X1
  1115  	MOVOU	16(SI), X2
  1116  	MOVOU	16(DI), X3
  1117  	MOVOU	32(SI), X4
  1118  	MOVOU	32(DI), X5
  1119  	MOVOU	48(SI), X6
  1120  	MOVOU	48(DI), X7
  1121  	PCMPEQB	X1, X0	// each result byte is 0xff where the two inputs match
  1122  	PCMPEQB	X3, X2
  1123  	PCMPEQB	X5, X4
  1124  	PCMPEQB	X7, X6
  1125  	PAND	X2, X0	// fold the four results: a byte stays 0xff only if all four matched
  1126  	PAND	X6, X4
  1127  	PAND	X4, X0
  1128  	PMOVMSKB X0, DX	// DX = one bit per byte of the folded result
  1129  	ADDL	$64, SI
  1130  	ADDL	$64, DI
  1131  	SUBL	$64, BX
  1132  	CMPL	DX, $0xffff	// all 16 bits set means all 64 bytes matched
  1133  	JEQ	hugeloop
  1134  	RET	// mismatch: AX is still 0
  1135  
  1136  	// 4 bytes at a time using 32-bit register
  1137  bigloop:
  1138  	CMPL	BX, $4
  1139  	JBE	leftover	// 4 or fewer bytes left: finish with one final load
  1140  	MOVL	(SI), CX
  1141  	MOVL	(DI), DX
  1142  	ADDL	$4, SI
  1143  	ADDL	$4, DI
  1144  	SUBL	$4, BX
  1145  	CMPL	CX, DX
  1146  	JEQ	bigloop
  1147  	RET	// mismatch: AX is still 0
  1148  
  1149  	// remaining 0-4 bytes: compare the last 4 bytes of each range (count was >= 4 on entry, so re-reading earlier bytes is safe)
  1150  leftover:
  1151  	MOVL	-4(SI)(BX*1), CX
  1152  	MOVL	-4(DI)(BX*1), DX
  1153  	CMPL	CX, DX
  1154  	SETEQ	AX
  1155  	RET
  1156  
  1157  small:
  1158  	CMPL	BX, $0
  1159  	JEQ	equal	// count == 0: ZF is set here, so SETEQ at equal yields 1
  1160  
  1161  	LEAL	0(BX*8), CX
  1162  	NEGL	CX	// CX = -8*count; as a 32-bit shift count (low 5 bits) that is 32 - 8*count
  1163  
  1164  	MOVL	SI, DX
  1165  	CMPB	DX, $0xfc	// look only at the low byte of the address
  1166  	JA	si_high
  1167  
  1168  	// load at SI won't cross a page boundary.
  1169  	MOVL	(SI), SI	// wanted bytes are the low count bytes; anything above is unrelated data
  1170  	JMP	si_finish
  1171  si_high:
  1172  	// address ends in 111111xx.  Load up to bytes we want, move to correct position.
  1173  	MOVL	-4(SI)(BX*1), SI	// the 4 bytes that end at a's last byte
  1174  	SHRL	CX, SI	// shift a's bytes down to the low end
  1175  si_finish:
  1176  
  1177  	// same for DI.
  1178  	MOVL	DI, DX
  1179  	CMPB	DX, $0xfc
  1180  	JA	di_high
  1181  	MOVL	(DI), DI
  1182  	JMP	di_finish
  1183  di_high:
  1184  	MOVL	-4(DI)(BX*1), DI
  1185  	SHRL	CX, DI
  1186  di_finish:
  1187  
  1188  	SUBL	SI, DI	// low count bytes of the difference are zero iff a and b match there
  1189  	SHLL	CX, DI	// shift out the unwanted high bytes; ZF = 1 iff the wanted bytes matched
  1190  equal:
  1191  	SETEQ	AX
  1192  	RET
  1193  
  1194  TEXT runtime·cmpstring(SB),NOSPLIT,$0-20	// res = cmpbody's 1/0/-1 verdict for s1 versus s2
  1195  	MOVL	s2+12(FP), DX	// DX = blen
  1196  	MOVL	s2+8(FP), DI	// DI = b
  1197  	MOVL	s1+4(FP), BX	// BX = alen
  1198  	MOVL	s1+0(FP), SI	// SI = a
  1199  	CALL	runtime·cmpbody(SB)
  1200  	MOVL	AX, res+16(FP)
  1201  	RET
  1202  
  1203  TEXT bytes·Compare(SB),NOSPLIT,$0-28	// res = cmpbody's 1/0/-1 verdict for s1 versus s2
  1204  	MOVL	s2+16(FP), DX	// DX = blen
  1205  	MOVL	s2+12(FP), DI	// DI = b
  1206  	MOVL	s1+4(FP), BX	// BX = alen
  1207  	MOVL	s1+0(FP), SI	// SI = a
  1208  	CALL	runtime·cmpbody(SB)
  1209  	MOVL	AX, res+24(FP)
  1210  	RET
  1211  
  1212  TEXT bytes·IndexByte(SB),NOSPLIT,$0	// ret = offset of the first byte in s equal to c, or -1 if none
  1213  	MOVL	s+0(FP), SI
  1214  	MOVL	s_len+4(FP), CX	// CX bounds the scan
  1215  	MOVB	c+12(FP), AL	// AL = byte to look for
  1216  	MOVL	SI, DI	// SCASB advances DI; SI keeps the start for the subtraction below
  1217  	CLD; REPN; SCASB	// scan forward until a byte equals AL or CX reaches 0
  1218  	JZ 3(PC)	// ZF set: skip the two "not found" instructions (with s_len == 0 flags are stale, but both paths give -1)
  1219  	MOVL	$-1, ret+16(FP)
  1220  	RET
  1221  	SUBL	SI, DI	// DI stopped one past the match, so the offset is DI - SI - 1
  1222  	SUBL	$1, DI
  1223  	MOVL	DI, ret+16(FP)
  1224  	RET
  1225  
  1226  TEXT strings·IndexByte(SB),NOSPLIT,$0	// ret = offset of the first byte in s equal to c, or -1 if none
  1227  	MOVL	s+0(FP), SI
  1228  	MOVL	s_len+4(FP), CX	// CX bounds the scan
  1229  	MOVB	c+8(FP), AL	// AL = byte to look for
  1230  	MOVL	SI, DI	// SCASB advances DI; SI keeps the start for the subtraction below
  1231  	CLD; REPN; SCASB	// scan forward until a byte equals AL or CX reaches 0
  1232  	JZ 3(PC)	// ZF set: skip the two "not found" instructions (with s_len == 0 flags are stale, but both paths give -1)
  1233  	MOVL	$-1, ret+12(FP)
  1234  	RET
  1235  	SUBL	SI, DI	// DI stopped one past the match, so the offset is DI - SI - 1
  1236  	SUBL	$1, DI
  1237  	MOVL	DI, ret+12(FP)
  1238  	RET
  1239  
  1240  // cmpbody: three-way comparison of byte ranges a and b.  input:
  1241  //   SI = a
  1242  //   DI = b
  1243  //   BX = alen
  1244  //   DX = blen
  1245  // output (SI, DI, BX, CX, BP, X0 and X1 may be overwritten):
  1246  //   AX = 1/0/-1 for a > b, a == b, a < b (bytes compared as unsigned, then lengths)
  1247  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1248  	CMPL	SI, DI
  1249  	JEQ	cmp_allsame	// same start address: the common bytes match, only lengths can differ
  1250  	CMPL	BX, DX
  1251  	MOVL	DX, BP	// MOVL leaves the flags from CMPL untouched for the CMOV below
  1252  	CMOVLLT	BX, BP // BP = min(alen, blen)
  1253  	CMPL	BP, $4
  1254  	JB	cmp_small
  1255  	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
  1256  	JE	cmp_mediumloop
  1257  cmp_largeloop:
  1258  	CMPL	BP, $16
  1259  	JB	cmp_mediumloop
  1260  	MOVOU	(SI), X0
  1261  	MOVOU	(DI), X1
  1262  	PCMPEQB X0, X1	// X1 bytes become 0xff where a and b agree
  1263  	PMOVMSKB X1, AX	// AX = 16-bit mask, one bit per byte
  1264  	XORL	$0xffff, AX	// convert EQ to NE
  1265  	JNE	cmp_diff16	// branch if at least one byte is not equal
  1266  	ADDL	$16, SI
  1267  	ADDL	$16, DI
  1268  	SUBL	$16, BP
  1269  	JMP	cmp_largeloop
  1270  
  1271  cmp_diff16:
  1272  	BSFL	AX, BX	// index of first byte that differs
  1273  	XORL	AX, AX
  1274  	MOVB	(SI)(BX*1), CX
  1275  	CMPB	CX, (DI)(BX*1)
  1276  	SETHI	AX	// AX = 1 if a's byte is above b's (unsigned), else 0
  1277  	LEAL	-1(AX*2), AX	// convert 1/0 to +1/-1
  1278  	RET
  1279  
  1280  cmp_mediumloop:
  1281  	CMPL	BP, $4
  1282  	JBE	cmp_0through4	// 4 or fewer common bytes left
  1283  	MOVL	(SI), AX
  1284  	MOVL	(DI), CX
  1285  	CMPL	AX, CX
  1286  	JNE	cmp_diff4
  1287  	ADDL	$4, SI
  1288  	ADDL	$4, DI
  1289  	SUBL	$4, BP
  1290  	JMP	cmp_mediumloop
  1291  
  1292  cmp_0through4:
  1293  	MOVL	-4(SI)(BP*1), AX	// last 4 common bytes; may re-read bytes already found equal
  1294  	MOVL	-4(DI)(BP*1), CX
  1295  	CMPL	AX, CX
  1296  	JEQ	cmp_allsame
  1297  
  1298  cmp_diff4:
  1299  	BSWAPL	AX	// reverse order of bytes so the first byte in memory is most significant
  1300  	BSWAPL	CX
  1301  	XORL	AX, CX	// find bit differences
  1302  	BSRL	CX, CX	// index of highest bit difference
  1303  	SHRL	CX, AX	// move a's bit to bottom
  1304  	ANDL	$1, AX	// mask bit
  1305  	LEAL	-1(AX*2), AX // 1/0 => +1/-1
  1306  	RET
  1307  
  1308  	// 0-3 bytes in common
  1309  cmp_small:
  1310  	LEAL	(BP*8), CX
  1311  	NEGL	CX	// CX = -8*BP; as a 32-bit shift count (low 5 bits) that is 32 - 8*BP
  1312  	JEQ	cmp_allsame	// ZF from NEGL: BP == 0, no common bytes, lengths decide
  1313  
  1314  	// load si
  1315  	CMPB	SI, $0xfc	// low address byte 0xfd-0xff: do not read 4 bytes forward from here
  1316  	JA	cmp_si_high
  1317  	MOVL	(SI), SI
  1318  	JMP	cmp_si_finish
  1319  cmp_si_high:
  1320  	MOVL	-4(SI)(BP*1), SI	// the 4 bytes that end at the last common byte
  1321  	SHRL	CX, SI	// bring the common bytes down to the low end
  1322  cmp_si_finish:
  1323  	SHLL	CX, SI	// common bytes to the top of the register, zeros below
  1324  
  1325  	// same for di
  1326  	CMPB	DI, $0xfc
  1327  	JA	cmp_di_high
  1328  	MOVL	(DI), DI
  1329  	JMP	cmp_di_finish
  1330  cmp_di_high:
  1331  	MOVL	-4(DI)(BP*1), DI
  1332  	SHRL	CX, DI
  1333  cmp_di_finish:
  1334  	SHLL	CX, DI
  1335  
  1336  	BSWAPL	SI	// reverse order of bytes
  1337  	BSWAPL	DI
  1338  	XORL	SI, DI	// find bit differences
  1339  	JEQ	cmp_allsame
  1340  	BSRL	DI, CX	// index of highest bit difference
  1341  	SHRL	CX, SI	// move a's bit to bottom
  1342  	ANDL	$1, SI	// mask bit
  1343  	LEAL	-1(SI*2), AX // 1/0 => +1/-1
  1344  	RET
  1345  
  1346  	// all the bytes in common are the same, so we just need
  1347  	// to compare the lengths.
  1348  cmp_allsame:
  1349  	XORL	AX, AX
  1350  	XORL	CX, CX
  1351  	CMPL	BX, DX
  1352  	SETGT	AX	// 1 if alen > blen
  1353  	SETEQ	CX	// 1 if alen == blen
  1354  	LEAL	-1(CX)(AX*2), AX	// 1,0,-1 result
  1355  	RET