github.com/rohankumardubey/syslog-redirector-golang@v0.0.0-20140320174030-4859f03d829a/src/pkg/runtime/asm_amd64.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "zasm_GOOS_GOARCH.h"
     6  #include "funcdata.h"
     7  #include "../../cmd/ld/textflag.h"
     8  
     9  TEXT _rt0_go(SB),NOSPLIT,$0	// runtime bootstrap: argc arrives in DI, argv in SI; sets up g0/m0, queues runtime·main and calls mstart
    10  	// copy arguments forward on an even stack
    11  	MOVQ	DI, AX		// argc
    12  	MOVQ	SI, BX		// argv
    13  	SUBQ	$(4*8+7), SP		// 2args 2auto
    14  	ANDQ	$~15, SP		// round SP down to a 16-byte boundary
    15  	MOVQ	AX, 16(SP)		// park argc in the first auto slot
    16  	MOVQ	BX, 24(SP)		// park argv in the second auto slot
    17  	
    18  	// create istack out of the given (operating system) stack.
    19  	// _cgo_init may update stackguard.
    20  	MOVQ	$runtime·g0(SB), DI
    21  	LEAQ	(-64*1024+104)(SP), BX	// guard = SP - 64KB + 104
    22  	MOVQ	BX, g_stackguard(DI)
    23  	MOVQ	BX, g_stackguard0(DI)
    24  	MOVQ	SP, g_stackbase(DI)
    25  
    26  	// find out information about the processor we're on
    27  	MOVQ	$0, AX
    28  	CPUID
    29  	CMPQ	AX, $0
    30  	JE	nocpuinfo		// leaf 0 left AX == 0: skip the leaf 1 query
    31  	MOVQ	$1, AX
    32  	CPUID
    33  	MOVL	CX, runtime·cpuid_ecx(SB)	// save the CX result of CPUID leaf 1
    34  	MOVL	DX, runtime·cpuid_edx(SB)	// save the DX result of CPUID leaf 1
    35  nocpuinfo:	
    36  	
    37  	// if there is an _cgo_init, call it.
    38  	MOVQ	_cgo_init(SB), AX
    39  	TESTQ	AX, AX
    40  	JZ	needtls		// no _cgo_init: fall back to the settls path below
    41  	// g0 already in DI
    42  	MOVQ	DI, CX	// Win64 uses CX for first parameter
    43  	MOVQ	$setmg_gcc<>(SB), SI	// SI = address of setmg_gcc
    44  	CALL	AX
    45  	// update stackguard after _cgo_init
    46  	MOVQ	$runtime·g0(SB), CX
    47  	MOVQ	g_stackguard0(CX), AX
    48  	MOVQ	AX, g_stackguard(CX)
    49  	CMPL	runtime·iswindows(SB), $0
    50  	JEQ ok		// iswindows == 0: skip the TLS setup below
    51  
    52  needtls:
    53  	// skip TLS setup on Plan 9
    54  	CMPL	runtime·isplan9(SB), $1
    55  	JEQ ok
    56  
    57  	LEAQ	runtime·tls0(SB), DI
    58  	CALL	runtime·settls(SB)
    59  
    60  	// store through it, to make sure it works
    61  	get_tls(BX)
    62  	MOVQ	$0x123, g(BX)
    63  	MOVQ	runtime·tls0(SB), AX
    64  	CMPQ	AX, $0x123
    65  	JEQ 2(PC)		// marker visible through tls0: skip the abort
    66  	MOVL	AX, 0	// abort
    67  ok:
    68  	// set the per-goroutine and per-mach "registers"
    69  	get_tls(BX)
    70  	LEAQ	runtime·g0(SB), CX
    71  	MOVQ	CX, g(BX)
    72  	LEAQ	runtime·m0(SB), AX
    73  	MOVQ	AX, m(BX)
    74  
    75  	// save m->g0 = g0
    76  	MOVQ	CX, m_g0(AX)
    77  
    78  	CLD				// convention is D is always left cleared
    79  	CALL	runtime·check(SB)
    80  
    81  	MOVL	16(SP), AX		// copy argc
    82  	MOVL	AX, 0(SP)
    83  	MOVQ	24(SP), AX		// copy argv
    84  	MOVQ	AX, 8(SP)
    85  	CALL	runtime·args(SB)
    86  	CALL	runtime·osinit(SB)
    87  	CALL	runtime·hashinit(SB)
    88  	CALL	runtime·schedinit(SB)
    89  
    90  	// create a new goroutine to start program
    91  	PUSHQ	$runtime·main·f(SB)		// entry
    92  	PUSHQ	$0			// arg size
    93  	ARGSIZE(16)
    94  	CALL	runtime·newproc(SB)
    95  	ARGSIZE(-1)
    96  	POPQ	AX		// discard the two words pushed for newproc
    97  	POPQ	AX
    98  
    99  	// start this M
   100  	CALL	runtime·mstart(SB)
   101  
   102  	MOVL	$0xf1, 0xf1  // crash
   103  	RET
   104  
   105  DATA	runtime·main·f+0(SB)/8,$runtime·main(SB)	// 8-byte word holding the address of runtime·main
   106  GLOBL	runtime·main·f(SB),RODATA,$8	// read-only, 8 bytes; its address is pushed as the newproc entry in _rt0_go
   107  
   108  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0	// execute one raw breakpoint opcode, then return
   109  	BYTE	$0xcc		// emitted as a literal byte (0xcc is the one-byte x86 INT3 encoding)
   110  	RET
   111  
   112  TEXT runtime·asminit(SB),NOSPLIT,$0-0	// no arguments, no frame; returns immediately
   113  	// No per-thread init.
   114  	RET
   115  
   116  /*
   117   *  go-routine
   118   */
   119  
   120  // void gosave(Gobuf*)
   121  // save state in Gobuf; setjmp. Records the caller's SP, PC and the current g; zeroes ret and ctxt.
   122  TEXT runtime·gosave(SB), NOSPLIT, $0-8
   123  	MOVQ	8(SP), AX		// gobuf
   124  	LEAQ	8(SP), BX		// caller's SP
   125  	MOVQ	BX, gobuf_sp(AX)
   126  	MOVQ	0(SP), BX		// caller's PC
   127  	MOVQ	BX, gobuf_pc(AX)
   128  	MOVQ	$0, gobuf_ret(AX)	// no return value recorded
   129  	MOVQ	$0, gobuf_ctxt(AX)	// no context recorded
   130  	get_tls(CX)
   131  	MOVQ	g(CX), BX		// current g
   132  	MOVQ	BX, gobuf_g(AX)
   133  	RET
   134  
   135  // void gogo(Gobuf*)
   136  // restore state from Gobuf; longjmp. Installs gobuf's g, SP, ret (AX) and ctxt (DX), then jumps to gobuf's pc.
   137  TEXT runtime·gogo(SB), NOSPLIT, $0-8
   138  	MOVQ	8(SP), BX		// gobuf
   139  	MOVQ	gobuf_g(BX), DX
   140  	MOVQ	0(DX), CX		// make sure g != nil
   141  	get_tls(CX)
   142  	MOVQ	DX, g(CX)		// gobuf's g becomes the current g
   143  	MOVQ	gobuf_sp(BX), SP	// restore SP
   144  	MOVQ	gobuf_ret(BX), AX	// saved return value goes back in AX
   145  	MOVQ	gobuf_ctxt(BX), DX	// saved context goes back in DX
   146  	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
   147  	MOVQ	$0, gobuf_ret(BX)
   148  	MOVQ	$0, gobuf_ctxt(BX)
   149  	MOVQ	gobuf_pc(BX), BX
   150  	JMP	BX			// continue at the saved PC; control does not come back here
   151  
   152  // void mcall(void (*fn)(G*))
   153  // Switch to m->g0's stack, call fn(g).
   154  // Fn must never return.  It should gogo(&g->sched)
   155  // to keep running g.
   156  TEXT runtime·mcall(SB), NOSPLIT, $0-8
   157  	MOVQ	fn+0(FP), DI
   158  	
   159  	get_tls(CX)
   160  	MOVQ	g(CX), AX	// save state in g->sched
   161  	MOVQ	0(SP), BX	// caller's PC
   162  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   163  	LEAQ	8(SP), BX	// caller's SP
   164  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   165  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   166  
   167  	// switch to m->g0 & its stack, call fn
   168  	MOVQ	m(CX), BX
   169  	MOVQ	m_g0(BX), SI
   170  	CMPQ	SI, AX	// if g == m->g0 call badmcall
   171  	JNE	3(PC)		// g != m->g0: skip the two-instruction badmcall jump
   172  	MOVQ	$runtime·badmcall(SB), AX
   173  	JMP	AX
   174  	MOVQ	SI, g(CX)	// g = m->g0
   175  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   176  	PUSHQ	AX		// fn's argument: the g that was running when mcall was entered
   177  	ARGSIZE(8)
   178  	CALL	DI
   179  	POPQ	AX
   180  	MOVQ	$runtime·badmcall2(SB), AX	// reached only if fn returned, which it must not
   181  	JMP	AX
   182  	RET
   183  
   184  /*
   185   * support for morestack
   186   */
   187  
   188  // Called during function prolog when more stack is needed.
   189  // Caller has already done get_tls(CX); MOVQ m(CX), BX.
   190  //
   191  // The traceback routines see morestack on a g0 as being
   192  // the top of a stack (for example, morestack calling newstack
   193  // calling the scheduler calling newm calling gc), so we must
   194  // record an argument size. For that purpose, it has no arguments.
   195  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   196  	// Cannot grow scheduler stack (m->g0).
   197  	MOVQ	m_g0(BX), SI
   198  	CMPQ	g(CX), SI
   199  	JNE	2(PC)		// current g is not m->g0: skip the trap
   200  	INT	$3
   201  
   202  	// Called from f.
   203  	// Set m->morebuf to f's caller.
   204  	MOVQ	8(SP), AX	// f's caller's PC
   205  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   206  	LEAQ	16(SP), AX	// f's caller's SP
   207  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   208  	MOVQ	AX, m_moreargp(BX)	// the same address is also recorded as the argument pointer
   209  	get_tls(CX)
   210  	MOVQ	g(CX), SI
   211  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   212  
   213  	// Set g->sched to context in f.
   214  	MOVQ	0(SP), AX // f's PC
   215  	MOVQ	AX, (g_sched+gobuf_pc)(SI)
   216  	MOVQ	SI, (g_sched+gobuf_g)(SI)
   217  	LEAQ	8(SP), AX // f's SP
   218  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   219  	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)	// DX on entry is saved as the context
   220  
   221  	// Call newstack on m->g0's stack.
   222  	MOVQ	m_g0(BX), BP
   223  	MOVQ	BP, g(CX)	// g = m->g0
   224  	MOVQ	(g_sched+gobuf_sp)(BP), SP	// sp = m->g0->sched.sp
   225  	CALL	runtime·newstack(SB)
   226  	MOVQ	$0, 0x1003	// crash if newstack returns
   227  	RET
   228  
   229  // Called from panic.  Mimics morestack,
   230  // reuses stack growth code to create a frame
   231  // with the desired args running the desired function.
   232  //
   233  // func call(fn *byte, arg *byte, argsize uint32).
   234  TEXT runtime·newstackcall(SB), NOSPLIT, $0-20
   235  	get_tls(CX)
   236  	MOVQ	m(CX), BX
   237  
   238  	// Save our caller's state as the PC and SP to
   239  	// restore when returning from f.
   240  	MOVQ	0(SP), AX	// our caller's PC
   241  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   242  	LEAQ	8(SP), AX	// our caller's SP
   243  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   244  	MOVQ	g(CX), AX	// AX = current g from here on
   245  	MOVQ	AX, (m_morebuf+gobuf_g)(BX)
   246  	
   247  	// Save our own state as the PC and SP to restore
   248  	// if this goroutine needs to be restarted.
   249  	MOVQ	$runtime·newstackcall(SB), (g_sched+gobuf_pc)(AX)	// restart point is this function's entry
   250  	MOVQ	SP, (g_sched+gobuf_sp)(AX)
   251  
   252  	// Set up morestack arguments to call f on a new stack.
   253  	// We set f's frame size to 1, as a hint to newstack
   254  	// that this is a call from runtime·newstackcall.
   255  	// If it turns out that f needs a larger frame than
   256  	// the default stack, f's usual stack growth prolog will
   257  	// allocate a new segment (and recopy the arguments).
   258  	MOVQ	8(SP), AX	// fn
   259  	MOVQ	16(SP), DX	// arg frame
   260  	MOVL	24(SP), CX	// arg size
   261  
   262  	MOVQ	AX, m_cret(BX)	// f's PC
   263  	MOVQ	DX, m_moreargp(BX)	// argument frame pointer
   264  	MOVL	CX, m_moreargsize(BX)	// f's argument size
   265  	MOVL	$1, m_moreframesize(BX)	// f's frame size
   266  
   267  	// Call newstack on m->g0's stack.
   268  	MOVQ	m_g0(BX), BP
   269  	get_tls(CX)		// CX was reused for the arg size above, so reload the TLS base
   270  	MOVQ	BP, g(CX)	// g = m->g0
   271  	MOVQ	(g_sched+gobuf_sp)(BP), SP	// sp = m->g0->sched.sp
   272  	CALL	runtime·newstack(SB)
   273  	MOVQ	$0, 0x1103	// crash if newstack returns
   274  	RET
   275  
   276  // reflect·call: call a function with the given argument list
   277  // func call(f *FuncVal, arg *byte, argsize uint32).
   278  // we don't have variable-sized frames, so we use a small number
   279  // of constant-sized-frame functions to encode a few bits of size in the pc.
   280  // Caution: ugly multiline assembly macros in your future!
   281  
   282  #define DISPATCH(NAME,MAXSIZE)		\
   283  	CMPQ	CX, $MAXSIZE;		\
   284  	JA	3(PC);			\
   285  	MOVQ	$runtime·NAME(SB), AX;	\
   286  	JMP	AX
   287  // Note: can't just "JMP runtime·NAME(SB)" - bad inlining results.
   288  
   289  TEXT reflect·call(SB), NOSPLIT, $0-20	// jump to the first runtime·callN whose N is >= argsize
   290  	MOVLQZX argsize+16(FP), CX	// CX = argsize, zero-extended; each DISPATCH compares against it
   291  	DISPATCH(call16, 16)
   292  	DISPATCH(call32, 32)
   293  	DISPATCH(call64, 64)
   294  	DISPATCH(call128, 128)
   295  	DISPATCH(call256, 256)
   296  	DISPATCH(call512, 512)
   297  	DISPATCH(call1024, 1024)
   298  	DISPATCH(call2048, 2048)
   299  	DISPATCH(call4096, 4096)
   300  	DISPATCH(call8192, 8192)
   301  	DISPATCH(call16384, 16384)
   302  	DISPATCH(call32768, 32768)
   303  	DISPATCH(call65536, 65536)
   304  	DISPATCH(call131072, 131072)
   305  	DISPATCH(call262144, 262144)
   306  	DISPATCH(call524288, 524288)
   307  	DISPATCH(call1048576, 1048576)
   308  	DISPATCH(call2097152, 2097152)
   309  	DISPATCH(call4194304, 4194304)
   310  	DISPATCH(call8388608, 8388608)
   311  	DISPATCH(call16777216, 16777216)
   312  	DISPATCH(call33554432, 33554432)
   313  	DISPATCH(call67108864, 67108864)
   314  	DISPATCH(call134217728, 134217728)
   315  	DISPATCH(call268435456, 268435456)
   316  	DISPATCH(call536870912, 536870912)
   317  	DISPATCH(call1073741824, 1073741824)
   318  	MOVQ	$runtime·badreflectcall(SB), AX	// argsize above 1073741824: no callN is large enough
   319  	JMP	AX
   320  // CALLFN(NAME,MAXSIZE) defines runtime·NAME with a MAXSIZE-byte frame: it copies argsize bytes from argptr to SP, calls through f, then copies argsize bytes from SP back to argptr.
   321  #define CALLFN(NAME,MAXSIZE)			\
   322  TEXT runtime·NAME(SB), WRAPPER, $MAXSIZE-20;		\
   323  	/* copy arguments to stack */		\
   324  	MOVQ	argptr+8(FP), SI;		\
   325  	MOVLQZX argsize+16(FP), CX;		\
   326  	MOVQ	SP, DI;				\
   327  	REP;MOVSB;				\
   328  	/* call function */			\
   329  	MOVQ	f+0(FP), DX;			\
   330  	CALL	(DX);				\
   331  	/* copy return values back */		\
   332  	MOVQ	argptr+8(FP), DI;		\
   333  	MOVLQZX	argsize+16(FP), CX;		\
   334  	MOVQ	SP, SI;				\
   335  	REP;MOVSB;				\
   336  	RET
   337  
   338  CALLFN(call16, 16)
   339  CALLFN(call32, 32)
   340  CALLFN(call64, 64)
   341  CALLFN(call128, 128)
   342  CALLFN(call256, 256)
   343  CALLFN(call512, 512)
   344  CALLFN(call1024, 1024)
   345  CALLFN(call2048, 2048)
   346  CALLFN(call4096, 4096)
   347  CALLFN(call8192, 8192)
   348  CALLFN(call16384, 16384)
   349  CALLFN(call32768, 32768)
   350  CALLFN(call65536, 65536)
   351  CALLFN(call131072, 131072)
   352  CALLFN(call262144, 262144)
   353  CALLFN(call524288, 524288)
   354  CALLFN(call1048576, 1048576)
   355  CALLFN(call2097152, 2097152)
   356  CALLFN(call4194304, 4194304)
   357  CALLFN(call8388608, 8388608)
   358  CALLFN(call16777216, 16777216)
   359  CALLFN(call33554432, 33554432)
   360  CALLFN(call67108864, 67108864)
   361  CALLFN(call134217728, 134217728)
   362  CALLFN(call268435456, 268435456)
   363  CALLFN(call536870912, 536870912)
   364  CALLFN(call1073741824, 1073741824)
   365  
   366  // Return point when leaving stack.
   367  //
   368  // Lessstack can appear in stack traces for the same reason
   369  // as morestack; in that context, it has 0 arguments.
   370  TEXT runtime·lessstack(SB), NOSPLIT, $0-0
   371  	// Save return value in m->cret
   372  	get_tls(CX)
   373  	MOVQ	m(CX), BX
   374  	MOVQ	AX, m_cret(BX)	// AX on entry is treated as the return value
   375  
   376  	// Call oldstack on m->g0's stack.
   377  	MOVQ	m_g0(BX), BP
   378  	MOVQ	BP, g(CX)	// g = m->g0
   379  	MOVQ	(g_sched+gobuf_sp)(BP), SP	// sp = m->g0->sched.sp
   380  	CALL	runtime·oldstack(SB)
   381  	MOVQ	$0, 0x1004	// crash if oldstack returns
   382  	RET
   383  
   384  // morestack trampolines: each loads CX/BX as morestack expects, stores a 64-bit value at m_moreframesize, then jumps to morestack
   385  TEXT runtime·morestack00(SB),NOSPLIT,$0
   386  	get_tls(CX)
   387  	MOVQ	m(CX), BX
   388  	MOVQ	$0, AX
   389  	MOVQ	AX, m_moreframesize(BX)	// store 64 bits of zero
   390  	MOVQ	$runtime·morestack(SB), AX
   391  	JMP	AX
   392  
   393  TEXT runtime·morestack01(SB),NOSPLIT,$0	// trampoline: value supplied by the caller in AX is moved to the upper 32 bits
   394  	get_tls(CX)
   395  	MOVQ	m(CX), BX
   396  	SHLQ	$32, AX		// low 32 bits of AX become the high half; low half is now zero
   397  	MOVQ	AX, m_moreframesize(BX)	// 64-bit store; presumably the high half lands in an adjacent m_moreargsize field — confirm against the M layout
   398  	MOVQ	$runtime·morestack(SB), AX
   399  	JMP	AX
   400  
   401  TEXT runtime·morestack10(SB),NOSPLIT,$0	// trampoline: only the low 32 bits of the caller's AX are kept
   402  	get_tls(CX)
   403  	MOVQ	m(CX), BX
   404  	MOVLQZX	AX, AX		// zero the upper 32 bits of AX
   405  	MOVQ	AX, m_moreframesize(BX)	// 64-bit store with a zero high half
   406  	MOVQ	$runtime·morestack(SB), AX
   407  	JMP	AX
   408  
   409  TEXT runtime·morestack11(SB),NOSPLIT,$0	// trampoline: the caller's full 64-bit AX is stored unchanged
   410  	get_tls(CX)
   411  	MOVQ	m(CX), BX
   412  	MOVQ	AX, m_moreframesize(BX)	// 64-bit store of AX as received
   413  	MOVQ	$runtime·morestack(SB), AX
   414  	JMP	AX
   415  
   416  // subcases of morestack01
   417  // with const of 8,16,...48: each passes const/8 in R8 to the shared morestack<> tail
   418  TEXT runtime·morestack8(SB),NOSPLIT,$0
   419  	MOVQ	$1, R8		// 1 = 8/8
   420  	MOVQ	$morestack<>(SB), AX
   421  	JMP	AX
   422  
   423  TEXT runtime·morestack16(SB),NOSPLIT,$0	// constant-16 variant of the morestack<> tail call
   424  	MOVQ	$2, R8		// 2 = 16/8
   425  	MOVQ	$morestack<>(SB), AX
   426  	JMP	AX
   427  
   428  TEXT runtime·morestack24(SB),NOSPLIT,$0	// constant-24 variant of the morestack<> tail call
   429  	MOVQ	$3, R8		// 3 = 24/8
   430  	MOVQ	$morestack<>(SB), AX
   431  	JMP	AX
   432  
   433  TEXT runtime·morestack32(SB),NOSPLIT,$0	// constant-32 variant of the morestack<> tail call
   434  	MOVQ	$4, R8		// 4 = 32/8
   435  	MOVQ	$morestack<>(SB), AX
   436  	JMP	AX
   437  
   438  TEXT runtime·morestack40(SB),NOSPLIT,$0	// constant-40 variant of the morestack<> tail call
   439  	MOVQ	$5, R8		// 5 = 40/8
   440  	MOVQ	$morestack<>(SB), AX
   441  	JMP	AX
   442  
   443  TEXT runtime·morestack48(SB),NOSPLIT,$0	// constant-48 variant of the morestack<> tail call
   444  	MOVQ	$6, R8		// 6 = 48/8
   445  	MOVQ	$morestack<>(SB), AX
   446  	JMP	AX
   447  
   448  TEXT morestack<>(SB),NOSPLIT,$0	// shared tail for morestack8..48: R8 holds const/8 on entry
   449  	get_tls(CX)
   450  	MOVQ	m(CX), BX
   451  	SHLQ	$35, R8		// R8<<35 == (8*R8)<<32: the byte constant ends up in the upper 32 bits
   452  	MOVQ	R8, m_moreframesize(BX)	// 64-bit store with a zero low half
   453  	MOVQ	$runtime·morestack(SB), AX
   454  	JMP	AX
   455  
   456  // bool cas(int32 *val, int32 old, int32 new)
   457  // Atomic 32-bit compare-and-swap:
   458  //	if(*val == old){
   459  //		*val = new;
   460  //		return 1;
   461  //	} else
   462  //		return 0;
   463  TEXT runtime·cas(SB), NOSPLIT, $0-16
   464  	MOVQ	val+0(FP), BX
   465  	MOVL	old+8(FP), AX		// CMPXCHG compares memory against AX
   466  	MOVL	new+12(FP), CX
   467  	LOCK
   468  	CMPXCHGL	CX, 0(BX)
   469  	JNZ	3(PC)			// ZF clear: *val differed from old, nothing stored
   470  	MOVL	$1, AX
   471  	RET
   472  	MOVL	$0, AX
   473  	RET
   474  
   475  // bool	runtime·cas64(uint64 *val, uint64 old, uint64 new)
   476  // Atomic 64-bit compare-and-swap (old is passed by value):
   477  //	if(*val == old){
   478  //		*val = new;
   479  //		return 1;
   480  //	} else {
   481  //		return 0;
   482  //	}
   483  TEXT runtime·cas64(SB), NOSPLIT, $0-24
   484  	MOVQ	val+0(FP), BX
   485  	MOVQ	old+8(FP), AX		// CMPXCHG compares memory against AX
   486  	MOVQ	new+16(FP), CX
   487  	LOCK
   488  	CMPXCHGQ	CX, 0(BX)
   489  	JZ	3(PC)			// ZF set: the swap happened
   490  	MOVL	$0, AX
   491  	RET
   492  	// swap succeeded
   493  	MOVL	$1, AX
   494  	RET
   495  
   496  // bool casp(void **val, void *old, void *new)
   497  // Atomic pointer-sized compare-and-swap:
   498  //	if(*val == old){
   499  //		*val = new;
   500  //		return 1;
   501  //	} else
   502  //		return 0;
   503  TEXT runtime·casp(SB), NOSPLIT, $0-24
   504  	MOVQ	val+0(FP), BX
   505  	MOVQ	old+8(FP), AX		// CMPXCHG compares memory against AX
   506  	MOVQ	new+16(FP), CX
   507  	LOCK
   508  	CMPXCHGQ	CX, 0(BX)
   509  	JNZ	3(PC)			// ZF clear: *val differed from old, nothing stored
   510  	MOVL	$1, AX
   511  	RET
   512  	MOVL	$0, AX
   513  	RET
   514  
   515  // uint32 xadd(uint32 volatile *val, int32 delta)
   516  // Atomic 32-bit add returning the updated value:
   517  //	*val += delta;
   518  //	return *val;
   519  TEXT runtime·xadd(SB), NOSPLIT, $0-12
   520  	MOVQ	val+0(FP), BX
   521  	MOVL	delta+8(FP), CX		// CX keeps delta for the final add
   522  	MOVL	CX, AX
   523  	LOCK
   524  	XADDL	AX, 0(BX)		// AX receives the value *val held before the add
   525  	ADDL	CX, AX			// previous value + delta = value now stored
   526  	RET
   527  
   528  TEXT runtime·xadd64(SB), NOSPLIT, $0-16	// 64-bit form of xadd: *val += delta, updated value returned in AX
   529  	MOVQ	val+0(FP), BX
   530  	MOVQ	delta+8(FP), CX		// CX keeps delta for the final add
   531  	MOVQ	CX, AX
   532  	LOCK
   533  	XADDQ	AX, 0(BX)		// AX receives the value *val held before the add
   534  	ADDQ	CX, AX			// previous value + delta = value now stored
   535  	RET
   536  
   537  TEXT runtime·xchg(SB), NOSPLIT, $0-12	// swap a 32-bit value into *val; the previous contents come back in AX
   538  	MOVL	new+8(FP), AX
   539  	MOVQ	val+0(FP), BX
   540  	XCHGL	AX, 0(BX)
   541  	RET
   542  
   543  TEXT runtime·xchg64(SB), NOSPLIT, $0-16	// swap a 64-bit value into *val; the previous contents come back in AX
   544  	MOVQ	new+8(FP), AX
   545  	MOVQ	val+0(FP), BX
   546  	XCHGQ	AX, 0(BX)
   547  	RET
   548  
   549  TEXT runtime·procyield(SB),NOSPLIT,$0-0	// spin: execute PAUSE once per count taken from the first argument slot
   550  	MOVL	8(SP), AX		// iteration count; NOTE(review): read from the caller's frame although the TEXT line declares 0 argument bytes — confirm intended
   551  again:
   552  	PAUSE
   553  	SUBL	$1, AX
   554  	JNZ	again		// repeat until the count reaches zero
   555  	RET
   556  
   557  TEXT runtime·atomicstorep(SB), NOSPLIT, $0-16	// store a pointer-sized value into *ptr by exchanging it in
   558  	MOVQ	val+8(FP), AX
   559  	MOVQ	ptr+0(FP), BX
   560  	XCHGQ	AX, 0(BX)
   561  	RET
   562  
   563  TEXT runtime·atomicstore(SB), NOSPLIT, $0-12	// store a 32-bit value into *ptr by exchanging it in
   564  	MOVL	val+8(FP), AX
   565  	MOVQ	ptr+0(FP), BX
   566  	XCHGL	AX, 0(BX)
   567  	RET
   568  
   569  TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16	// store a 64-bit value into *ptr by exchanging it in
   570  	MOVQ	val+8(FP), AX
   571  	MOVQ	ptr+0(FP), BX
   572  	XCHGQ	AX, 0(BX)
   573  	RET
   574  
   575  // void jmpdefer(fn, sp);
   576  // called from deferreturn.
   577  // 1. pop the caller
   578  // 2. sub 5 bytes from the callers return
   579  // 3. jmp to the argument
   580  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
   581  	MOVQ	8(SP), DX	// fn
   582  	MOVQ	16(SP), BX	// caller sp
   583  	LEAQ	-8(BX), SP	// caller sp after CALL
   584  	SUBQ	$5, (SP)	// return to CALL again
   585  	MOVQ	0(DX), BX	// the word at 0(fn) is the address to jump to; DX still holds fn
   586  	JMP	BX	// but first run the deferred function
   587  
   588  // Save state of caller into g->sched. Smashes R8, R9. Takes no arguments; zeroes sched.ret and sched.ctxt.
   589  TEXT gosave<>(SB),NOSPLIT,$0
   590  	get_tls(R8)
   591  	MOVQ	g(R8), R8		// R8 = current g
   592  	MOVQ	0(SP), R9		// caller's PC (our return address)
   593  	MOVQ	R9, (g_sched+gobuf_pc)(R8)
   594  	LEAQ	8(SP), R9		// caller's SP as it will be after we return
   595  	MOVQ	R9, (g_sched+gobuf_sp)(R8)
   596  	MOVQ	$0, (g_sched+gobuf_ret)(R8)
   597  	MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
   598  	RET
   599  
   600  // asmcgocall(void(*fn)(void*), void *arg)
   601  // Call fn(arg) on the scheduler stack,
   602  // aligned appropriately for the gcc ABI.
   603  // See cgocall.c for more details.
   604  TEXT runtime·asmcgocall(SB),NOSPLIT,$0-16
   605  	MOVQ	fn+0(FP), AX
   606  	MOVQ	arg+8(FP), BX
   607  	MOVQ	SP, DX			// remember the entry SP so it can be restored after the call
   608  
   609  	// Figure out if we need to switch to m->g0 stack.
   610  	// We get called to create new OS threads too, and those
   611  	// come in on the m->g0 stack already.
   612  	get_tls(CX)
   613  	MOVQ	m(CX), BP
   614  	MOVQ	m_g0(BP), SI
   615  	MOVQ	g(CX), DI
   616  	CMPQ	SI, DI
   617  	JEQ	4(PC)			// already on m->g0: skip the three-instruction stack switch
   618  	CALL	gosave<>(SB)
   619  	MOVQ	SI, g(CX)		// g = m->g0
   620  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   621  
   622  	// Now on a scheduling stack (a pthread-created stack).
   623  	// Make sure we have enough room for 4 stack-backed fast-call
   624  	// registers as per windows amd64 calling convention.
   625  	SUBQ	$64, SP
   626  	ANDQ	$~15, SP	// alignment for gcc ABI
   627  	MOVQ	DI, 48(SP)	// save g
   628  	MOVQ	DX, 40(SP)	// save SP
   629  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   630  	MOVQ	BX, CX		// CX = first argument in Win64
   631  	CALL	AX
   632  
   633  	// Restore registers, g, stack pointer.
   634  	get_tls(CX)
   635  	MOVQ	48(SP), DI		// g saved before the call
   636  	MOVQ	DI, g(CX)
   637  	MOVQ	40(SP), SP		// SP saved before the call
   638  	RET
   639  
   640  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
   641  // Turn the fn into a Go func (by taking its address) and call
   642  // cgocallback_gofunc.
   643  TEXT runtime·cgocallback(SB),NOSPLIT,$24-24
   644  	LEAQ	fn+0(FP), AX		// address of the fn argument slot, not fn itself
   645  	MOVQ	AX, 0(SP)
   646  	MOVQ	frame+8(FP), AX
   647  	MOVQ	AX, 8(SP)
   648  	MOVQ	framesize+16(FP), AX
   649  	MOVQ	AX, 16(SP)
   650  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   651  	CALL	AX
   652  	RET
   653  
   654  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
   655  // See cgocall.c for more details.
   656  TEXT runtime·cgocallback_gofunc(SB),NOSPLIT,$8-24
   657  	// If m is nil, Go did not create the current thread.
   658  	// Call needm to obtain one for temporary use.
   659  	// In this case, we're running on the thread stack, so there's
   660  	// lots of space, but the linker doesn't know. Hide the call from
   661  	// the linker analysis by using an indirect call through AX.
   662  	get_tls(CX)
   663  #ifdef GOOS_windows
   664  	MOVL	$0, BP			// default: treat m as nil
   665  	CMPQ	CX, $0
   666  	JEQ	2(PC)			// TLS base is zero: skip the load through it and keep BP == 0
   667  #endif
   668  	MOVQ	m(CX), BP
   669  	MOVQ	BP, R8 // holds oldm until end of function
   670  	CMPQ	BP, $0
   671  	JNE	havem
   672  needm:
   673  	MOVQ	R8, 0(SP)		// keep oldm in the frame slot across the call
   674  	MOVQ	$runtime·needm(SB), AX
   675  	CALL	AX
   676  	MOVQ	0(SP), R8
   677  	get_tls(CX)
   678  	MOVQ	m(CX), BP
   679  
   680  havem:
   681  	// Now there's a valid m, and we're running on its m->g0.
   682  	// Save current m->g0->sched.sp on stack and then set it to SP.
   683  	// Save current sp in m->g0->sched.sp in preparation for
   684  	// switch back to m->curg stack.
   685  	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
   686  	MOVQ	m_g0(BP), SI
   687  	MOVQ	(g_sched+gobuf_sp)(SI), AX
   688  	MOVQ	AX, 0(SP)
   689  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   690  
   691  	// Switch to m->curg stack and call runtime.cgocallbackg.
   692  	// Because we are taking over the execution of m->curg
   693  	// but *not* resuming what had been running, we need to
   694  	// save that information (m->curg->sched) so we can restore it.
   695  	// We can restore m->curg->sched.sp easily, because calling
   696  	// runtime.cgocallbackg leaves SP unchanged upon return.
   697  	// To save m->curg->sched.pc, we push it onto the stack.
   698  	// This has the added benefit that it looks to the traceback
   699  	// routine like cgocallbackg is going to return to that
   700  	// PC (because the frame we allocate below has the same
   701  	// size as cgocallback_gofunc's frame declared above)
   702  	// so that the traceback will seamlessly trace back into
   703  	// the earlier calls.
   704  	//
   705  	// In the new goroutine, 0(SP) holds the saved R8.
   706  	MOVQ	m_curg(BP), SI
   707  	MOVQ	SI, g(CX)		// g = m->curg
   708  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   709  	MOVQ	(g_sched+gobuf_pc)(SI), BP
   710  	MOVQ	BP, -8(DI)		// saved sched.pc sits just below the old sched.sp
   711  	LEAQ	-(8+8)(DI), SP		// 8 bytes for the saved pc + 8 bytes for the R8 slot
   712  	MOVQ	R8, 0(SP)
   713  	CALL	runtime·cgocallbackg(SB)
   714  	MOVQ	0(SP), R8
   715  
   716  	// Restore g->sched (== m->curg->sched) from saved values.
   717  	get_tls(CX)
   718  	MOVQ	g(CX), SI
   719  	MOVQ	8(SP), BP		// the pc stored at -8(DI) above
   720  	MOVQ	BP, (g_sched+gobuf_pc)(SI)
   721  	LEAQ	(8+8)(SP), DI		// undo the 16-byte adjustment to recover the old sched.sp
   722  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   723  
   724  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   725  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   726  	// so we do not have to restore it.)
   727  	MOVQ	m(CX), BP
   728  	MOVQ	m_g0(BP), SI
   729  	MOVQ	SI, g(CX)
   730  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   731  	MOVQ	0(SP), AX		// the sched.sp value saved at havem
   732  	MOVQ	AX, (g_sched+gobuf_sp)(SI)
   733  	
   734  	// If the m on entry was nil, we called needm above to borrow an m
   735  	// for the duration of the call. Since the call is over, return it with dropm.
   736  	CMPQ	R8, $0
   737  	JNE 3(PC)			// had an m on entry: skip the dropm call
   738  	MOVQ	$runtime·dropm(SB), AX
   739  	CALL	AX
   740  
   741  	// Done!
   742  	RET
   743  
   744  // void setmg(M*, G*); set m and g. for use by needm.
   745  TEXT runtime·setmg(SB), NOSPLIT, $0-16
   746  	MOVQ	mm+0(FP), AX
   747  #ifdef GOOS_windows
   748  	CMPQ	AX, $0
   749  	JNE	settls
   750  	MOVQ	$0, 0x28(GS)		// m is nil: clear the slot at GS:0x28 and return without touching m/g
   751  	RET
   752  settls:
   753  	LEAQ	m_tls(AX), AX		// address of the tls field inside this m
   754  	MOVQ	AX, 0x28(GS)		// GS:0x28 now points at that field
   755  #endif
   756  	get_tls(CX)
   757  	MOVQ	mm+0(FP), AX		// reload m (AX may have been overwritten above)
   758  	MOVQ	AX, m(CX)
   759  	MOVQ	gg+8(FP), BX
   760  	MOVQ	BX, g(CX)
   761  	RET
   762  
   763  // void setmg_gcc(M*, G*); set m and g called from gcc. Takes m in DI and g in SI rather than on the stack.
   764  TEXT setmg_gcc<>(SB),NOSPLIT,$0
   765  	get_tls(AX)
   766  	MOVQ	DI, m(AX)		// m = first register argument
   767  	MOVQ	SI, g(AX)		// g = second register argument
   768  	RET
   769  
   770  // check that g->stackguard < SP < g->stackbase (unsigned compares); executes INT $3 if either bound fails
   771  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   772  	get_tls(CX)
   773  	MOVQ	g(CX), AX
   774  	CMPQ	g_stackbase(AX), SP
   775  	JHI	2(PC)			// stackbase > SP: upper bound holds, skip the trap
   776  	INT	$3
   777  	CMPQ	SP, g_stackguard(AX)
   778  	JHI	2(PC)			// SP > stackguard: lower bound holds, skip the trap
   779  	INT	$3
   780  	RET
   781  
   782  TEXT runtime·memclr(SB),NOSPLIT,$0-16	// zero count bytes starting at addr
   783  	MOVQ	8(SP), DI		// arg 1 addr
   784  	MOVQ	16(SP), CX		// arg 2 count
   785  	MOVQ	CX, BX
   786  	ANDQ	$7, BX			// BX = count % 8: bytes left over after the quadword pass
   787  	SHRQ	$3, CX			// CX = count / 8: number of quadwords to clear
   788  	MOVQ	$0, AX			// value to store
   789  	CLD
   790  	REP
   791  	STOSQ				// write CX zero quadwords at DI, advancing DI
   792  	MOVQ	BX, CX
   793  	REP
   794  	STOSB				// write the remaining 0-7 zero bytes
   795  	RET
   796  
   797  TEXT runtime·getcallerpc(SB),NOSPLIT,$0-8	// returns in AX the 8-byte word stored just below the address passed in
   798  	MOVQ	x+0(FP),AX		// addr of first arg
   799  	MOVQ	-8(AX),AX		// get calling pc
   800  	RET
   801  
   802  TEXT runtime·setcallerpc(SB),NOSPLIT,$0-16	// overwrites the 8-byte word just below the address passed in with the second argument
   803  	MOVQ	x+0(FP),AX		// addr of first arg
   804  	MOVQ	x+8(FP), BX		// second argument: the new pc value
   805  	MOVQ	BX, -8(AX)		// set calling pc
   806  	RET
   807  
   808  TEXT runtime·getcallersp(SB),NOSPLIT,$0-8	// returns its single argument unchanged in AX
   809  	MOVQ	sp+0(FP), AX
   810  	RET
   811  
   812  // int64 runtime·cputicks(void): reads the time-stamp counter and returns it as one 64-bit value in AX
   813  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   814  	RDTSC				// result arrives split across DX (high) and AX (low)
   815  	SHLQ	$32, DX			// move the high half into bits 32-63
   816  	ADDQ	DX, AX			// combine both halves in AX
   817  	RET
   818  
   819  TEXT runtime·stackguard(SB),NOSPLIT,$0-16	// writes the current SP and g->stackguard into its two 8-byte argument slots
   820  	MOVQ	SP, DX
   821  	MOVQ	DX, sp+0(FP)		// slot 0: SP as seen inside this function
   822  	get_tls(CX)
   823  	MOVQ	g(CX), BX
   824  	MOVQ	g_stackguard(BX), DX
   825  	MOVQ	DX, limit+8(FP)		// slot 1: the current g's stackguard
   826  	RET
   827  
   828  GLOBL runtime·tls0(SB), $64	// 64-byte block whose address _rt0_go passes to settls and then reads back for its TLS self-test
   829  
   830  // hash function using AES hardware instructions; loads its three arguments into DX/CX/AX and tail-jumps to aeshashbody
   831  TEXT runtime·aeshash(SB),NOSPLIT,$0-24
   832  	MOVQ	8(SP), DX	// ptr to hash value
   833  	MOVQ	16(SP), CX	// size
   834  	MOVQ	24(SP), AX	// ptr to data
   835  	JMP	runtime·aeshashbody(SB)
   836  
   837  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24	// string variant: unpacks data pointer and length from the string struct, then tail-jumps to aeshashbody
   838  	MOVQ	8(SP), DX	// ptr to hash value
   839  	MOVQ	24(SP), AX	// ptr to string struct
   840  	MOVQ	8(AX), CX	// length of string
   841  	MOVQ	(AX), AX	// string data
   842  	JMP	runtime·aeshashbody(SB)
   843  
   844  // AX: data
   845  // CX: length
   846  // DX: ptr to seed input / hash output
   847  TEXT runtime·aeshashbody(SB),NOSPLIT,$0-24
   848  	MOVQ	(DX), X0	// seed to low 64 bits of xmm0
   849  	PINSRQ	$1, CX, X0	// size to high 64 bits of xmm0
   850  	MOVO	runtime·aeskeysched+0(SB), X2	// X2 = first 16 bytes of aeskeysched
   851  	MOVO	runtime·aeskeysched+16(SB), X3	// X3 = second 16 bytes of aeskeysched
   852  	CMPQ	CX, $16
   853  	JB	aessmall
   854  aesloop:
   855  	CMPQ	CX, $16
   856  	JBE	aesloopend	// 16 or fewer bytes left: handle them as the final block
   857  	MOVOU	(AX), X1	// next 16 bytes of input
   858  	AESENC	X2, X0
   859  	AESENC	X1, X0
   860  	SUBQ	$16, CX
   861  	ADDQ	$16, AX
   862  	JMP	aesloop
   863  // 1-16 bytes remaining
   864  aesloopend:
   865  	// This load may overlap with the previous load above.
   866  	// We'll hash some bytes twice, but that's ok.
   867  	MOVOU	-16(AX)(CX*1), X1	// the 16 bytes that end at the last input byte
   868  	JMP	partial
   869  // 0-15 bytes
   870  aessmall:
   871  	TESTQ	CX, CX
   872  	JE	finalize	// 0 bytes
   873  
   874  	CMPB	AX, $0xf0	// compares only the low byte of the data address
   875  	JA	highpartial
   876  
   877  	// 16 bytes loaded at this address won't cross
   878  	// a page boundary, so we can load it directly.
   879  	MOVOU	(AX), X1
   880  	ADDQ	CX, CX		// CX = 2*len so that CX*8 indexes 16-byte table entries
   881  	PAND	masks<>(SB)(CX*8), X1	// keep only the low len bytes
   882  	JMP	partial
   883  highpartial:
   884  	// address ends in 1111xxxx.  Might be up against
   885  	// a page boundary, so load ending at last byte.
   886  	// Then shift bytes down using pshufb.
   887  	MOVOU	-16(AX)(CX*1), X1
   888  	ADDQ	CX, CX		// CX = 2*len so that CX*8 indexes 16-byte table entries
   889  	PSHUFB	shifts<>(SB)(CX*8), X1
   890  partial:
   891  	// incorporate partial block into hash
   892  	AESENC	X3, X0
   893  	AESENC	X1, X0
   894  finalize:	
   895  	// finalize hash
   896  	AESENC	X2, X0
   897  	AESENC	X3, X0
   898  	AESENC	X2, X0
   899  	MOVQ	X0, (DX)	// low 64 bits of the state overwrite the seed as the result
   900  	RET
   901  
   902  TEXT runtime·aeshash32(SB),NOSPLIT,$0-24	// fixed-size variant hashing the 4 bytes at the data pointer; the size argument is not read
   903  	MOVQ	8(SP), DX	// ptr to hash value
   904  	MOVQ	24(SP), AX	// ptr to data
   905  	MOVQ	(DX), X0	// seed
   906  	PINSRD	$2, (AX), X0	// data
   907  	AESENC	runtime·aeskeysched+0(SB), X0
   908  	AESENC	runtime·aeskeysched+16(SB), X0
   909  	AESENC	runtime·aeskeysched+0(SB), X0
   910  	MOVQ	X0, (DX)	// low 64 bits of the state overwrite the seed as the result
   911  	RET
   912  
   913  TEXT runtime·aeshash64(SB),NOSPLIT,$0-24	// fixed-size variant hashing the 8 bytes at the data pointer; the size argument is not read
   914  	MOVQ	8(SP), DX	// ptr to hash value
   915  	MOVQ	24(SP), AX	// ptr to data
   916  	MOVQ	(DX), X0	// seed
   917  	PINSRQ	$1, (AX), X0	// data
   918  	AESENC	runtime·aeskeysched+0(SB), X0
   919  	AESENC	runtime·aeskeysched+16(SB), X0
   920  	AESENC	runtime·aeskeysched+0(SB), X0
   921  	MOVQ	X0, (DX)	// low 64 bits of the state overwrite the seed as the result
   922  	RET
   923  
   924  // simple mask to get rid of data in the high part of the register. 16 entries of 16 bytes; entry i has its low i bytes set to 0xff.
   925  DATA masks<>+0x00(SB)/8, $0x0000000000000000
   926  DATA masks<>+0x08(SB)/8, $0x0000000000000000
   927  DATA masks<>+0x10(SB)/8, $0x00000000000000ff
   928  DATA masks<>+0x18(SB)/8, $0x0000000000000000
   929  DATA masks<>+0x20(SB)/8, $0x000000000000ffff
   930  DATA masks<>+0x28(SB)/8, $0x0000000000000000
   931  DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
   932  DATA masks<>+0x38(SB)/8, $0x0000000000000000
   933  DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
   934  DATA masks<>+0x48(SB)/8, $0x0000000000000000
   935  DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
   936  DATA masks<>+0x58(SB)/8, $0x0000000000000000
   937  DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
   938  DATA masks<>+0x68(SB)/8, $0x0000000000000000
   939  DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
   940  DATA masks<>+0x78(SB)/8, $0x0000000000000000
   941  DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
   942  DATA masks<>+0x88(SB)/8, $0x0000000000000000
   943  DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
   944  DATA masks<>+0x98(SB)/8, $0x00000000000000ff
   945  DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
   946  DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
   947  DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
   948  DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
   949  DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
   950  DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
   951  DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
   952  DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
   953  DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
   954  DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
   955  DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
   956  DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
   957  GLOBL masks<>(SB),RODATA,$256
   958  
   959  // these are arguments to pshufb.  They move data down from
   960  // the high bytes of the register to the low bytes of the register.
   961  // index is how many bytes to move. 16 entries of 16 bytes; the 0xff selector bytes have their top bit set, which pshufb turns into zero output bytes.
   962  DATA shifts<>+0x00(SB)/8, $0x0000000000000000
   963  DATA shifts<>+0x08(SB)/8, $0x0000000000000000
   964  DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
   965  DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
   966  DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
   967  DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
   968  DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
   969  DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
   970  DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
   971  DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
   972  DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
   973  DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
   974  DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
   975  DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
   976  DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
   977  DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
   978  DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
   979  DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
   980  DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
   981  DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
   982  DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
   983  DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
   984  DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
   985  DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
   986  DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
   987  DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
   988  DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
   989  DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
   990  DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
   991  DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
   992  DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
   993  DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
   994  GLOBL shifts<>(SB),RODATA,$256
   995  
   996  TEXT runtime·memeq(SB),NOSPLIT,$0-24	// stack-argument entry point for memeqbody
   997  	MOVQ	count+16(FP), BX	// BX = number of bytes to compare
   998  	MOVQ	b+8(FP), DI	// DI = second buffer
   999  	MOVQ	a+0(FP), SI	// SI = first buffer
  1000  	JMP	runtime·memeqbody(SB)	// tail jump: memeqbody's RET returns to our caller with the result in AX
  1001  
  1002  // memeqbody reports whether the count bytes at a and the count bytes at b are identical.
  1003  // input: a in SI, b in DI, count in BX
  1004  // output: AX = 1 if equal, 0 if not (AX is zeroed first; SETEQ then writes its low byte)
  1005  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
  1006  	XORQ	AX, AX	// AX = 0, so the bare RETs below report "not equal"
  1007  
  1008  	CMPQ	BX, $8
  1009  	JB	small	// fewer than 8 bytes: handled with one 8-byte load per side
  1010  	
  1011  	// 64 bytes at a time using xmm registers
  1012  hugeloop:
  1013  	CMPQ	BX, $64
  1014  	JB	bigloop
  1015  	MOVOU	(SI), X0
  1016  	MOVOU	(DI), X1
  1017  	MOVOU	16(SI), X2
  1018  	MOVOU	16(DI), X3
  1019  	MOVOU	32(SI), X4
  1020  	MOVOU	32(DI), X5
  1021  	MOVOU	48(SI), X6
  1022  	MOVOU	48(DI), X7
  1023  	PCMPEQB	X1, X0	// each result byte is 0xff where a and b match, 0x00 where they differ
  1024  	PCMPEQB	X3, X2
  1025  	PCMPEQB	X5, X4
  1026  	PCMPEQB	X7, X6
  1027  	PAND	X2, X0	// fold the four comparison results into X0
  1028  	PAND	X6, X4
  1029  	PAND	X4, X0
  1030  	PMOVMSKB X0, DX	// DX = top bit of each of the 16 bytes of X0
  1031  	ADDQ	$64, SI
  1032  	ADDQ	$64, DI
  1033  	SUBQ	$64, BX
  1034  	CMPL	DX, $0xffff	// all 16 bits set <=> all 64 bytes matched
  1035  	JEQ	hugeloop
  1036  	RET	// mismatch; AX is still 0
  1037  
  1038  	// 8 bytes at a time using 64-bit register
  1039  bigloop:
  1040  	CMPQ	BX, $8
  1041  	JBE	leftover	// 8 or fewer bytes left: finish with a single load per side
  1042  	MOVQ	(SI), CX
  1043  	MOVQ	(DI), DX
  1044  	ADDQ	$8, SI
  1045  	ADDQ	$8, DI
  1046  	SUBQ	$8, BX
  1047  	CMPQ	CX, DX
  1048  	JEQ	bigloop
  1049  	RET	// mismatch; AX is still 0
  1050  
  1051  	// remaining 0-8 bytes: compare the final 8 bytes of each buffer (count was >= 8 on this path, so the load may overlap bytes already compared)
  1052  leftover:
  1053  	MOVQ	-8(SI)(BX*1), CX
  1054  	MOVQ	-8(DI)(BX*1), DX
  1055  	CMPQ	CX, DX
  1056  	SETEQ	AX
  1057  	RET
  1058  
  1059  small:
  1060  	CMPQ	BX, $0
  1061  	JEQ	equal	// count == 0: ZF is set, so the SETEQ at equal yields 1
  1062  
  1063  	LEAQ	0(BX*8), CX
  1064  	NEGQ	CX	// CX = -8*count; a 64-bit shift uses only the low 6 bits, i.e. 64 - 8*count
  1065  
  1066  	CMPB	SI, $0xf8	// examines only the low byte of the address
  1067  	JA	si_high
  1068  
  1069  	// load at SI won't cross a page boundary.
  1070  	MOVQ	(SI), SI
  1071  	JMP	si_finish
  1072  si_high:
  1073  	// address ends in 11111xxx.  Load the 8 bytes that end at the last byte we want, then shift the wanted bytes down to the bottom.
  1074  	MOVQ	-8(SI)(BX*1), SI
  1075  	SHRQ	CX, SI
  1076  si_finish:
  1077  
  1078  	// same for DI.
  1079  	CMPB	DI, $0xf8
  1080  	JA	di_high
  1081  	MOVQ	(DI), DI
  1082  	JMP	di_finish
  1083  di_high:
  1084  	MOVQ	-8(DI)(BX*1), DI
  1085  	SHRQ	CX, DI
  1086  di_finish:
  1087  
  1088  	SUBQ	SI, DI	// the low count bytes of the difference are zero iff the low count bytes match
  1089  	SHLQ	CX, DI	// shift out the unwanted high bytes; ZF is set iff the wanted bytes matched
  1090  equal:
  1091  	SETEQ	AX
  1092  	RET
  1093  
  1094  TEXT runtime·cmpstring(SB),NOSPLIT,$0-40	// three-way string comparison; the work is done by cmpbody
  1095  	MOVQ	s2+24(FP), DX	// DX = length of s2
  1096  	MOVQ	s1+8(FP), BX	// BX = length of s1
  1097  	MOVQ	s2+16(FP), DI	// DI = data pointer of s2
  1098  	MOVQ	s1+0(FP), SI	// SI = data pointer of s1
  1099  	CALL	runtime·cmpbody(SB)
  1100  	MOVQ	AX, res+32(FP)	// store cmpbody's 1/0/-1 result
  1101  	RET
  1102  
  1103  TEXT bytes·Compare(SB),NOSPLIT,$0-56	// three-way slice comparison; the work is done by cmpbody
  1104  	MOVQ	s2+32(FP), DX	// DX = length of s2
  1105  	MOVQ	s1+8(FP), BX	// BX = length of s1
  1106  	MOVQ	s2+24(FP), DI	// DI = data pointer of s2
  1107  	MOVQ	s1+0(FP), SI	// SI = data pointer of s1
  1108  	CALL	runtime·cmpbody(SB)
  1109  	MOVQ	AX, res+48(FP)	// store cmpbody's 1/0/-1 result
  1110  	RET
  1111  
  1112  // cmpbody orders two byte sequences: the first differing byte (compared as unsigned) decides; if the common prefix is identical, the lengths decide.
  1113  // input:
  1114  //   SI = a, DI = b (pointers to the data)
  1115  //   BX = alen, DX = blen
  1116  // output:
  1117  //   AX = 1/0/-1 (+1 if a > b, 0 if equal, -1 if a < b)
  1118  // clobbers BX, CX, BP, SI, DI, X0, X1 and the flags
  1119  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
  1120  	CMPQ	SI, DI
  1121  	JEQ	cmp_allsame	// same pointer: only the lengths can differ
  1122  	CMPQ	BX, DX
  1123  	MOVQ	DX, BP	// MOVQ leaves the flags from CMPQ intact for the CMOVQLT
  1124  	CMOVQLT	BX, BP // BP = min(alen, blen) = # of bytes to compare
  1125  	CMPQ	BP, $8
  1126  	JB	cmp_small
  1127  
  1128  cmp_loop:
  1129  	CMPQ	BP, $16
  1130  	JBE	cmp_0through16
  1131  	MOVOU	(SI), X0
  1132  	MOVOU	(DI), X1
  1133  	PCMPEQB X0, X1	// each byte of X1 becomes 0xff where a and b match
  1134  	PMOVMSKB X1, AX	// AX = one bit per byte, set where equal
  1135  	XORQ	$0xffff, AX	// convert EQ to NE
  1136  	JNE	cmp_diff16	// branch if at least one byte is not equal
  1137  	ADDQ	$16, SI
  1138  	ADDQ	$16, DI
  1139  	SUBQ	$16, BP
  1140  	JMP	cmp_loop
  1141  	
  1142  	// AX = bit mask of differences
  1143  cmp_diff16:
  1144  	BSFQ	AX, BX	// index of first byte that differs (overwrites alen, which is no longer needed on this path)
  1145  	XORQ	AX, AX
  1146  	MOVB	(SI)(BX*1), CX
  1147  	CMPB	CX, (DI)(BX*1)
  1148  	SETHI	AX	// 1 if a's byte is above b's byte (unsigned)
  1149  	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
  1150  	RET
  1151  
  1152  	// 0 through 16 bytes left, alen>=8, blen>=8
  1153  cmp_0through16:
  1154  	CMPQ	BP, $8
  1155  	JBE	cmp_0through8
  1156  	MOVQ	(SI), AX
  1157  	MOVQ	(DI), CX
  1158  	CMPQ	AX, CX
  1159  	JNE	cmp_diff8
  1160  cmp_0through8:
  1161  	MOVQ	-8(SI)(BP*1), AX	// last 8 bytes of the common prefix; may overlap bytes already compared
  1162  	MOVQ	-8(DI)(BP*1), CX
  1163  	CMPQ	AX, CX
  1164  	JEQ	cmp_allsame
  1165  
  1166  	// AX and CX contain parts of a and b that differ.
  1167  cmp_diff8:
  1168  	BSWAPQ	AX	// reverse order of bytes, so the lowest-addressed byte becomes the most significant
  1169  	BSWAPQ	CX
  1170  	XORQ	AX, CX
  1171  	BSRQ	CX, CX	// index of highest bit difference
  1172  	SHRQ	CX, AX	// move a's bit to bottom
  1173  	ANDQ	$1, AX	// mask bit
  1174  	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
  1175  	RET
  1176  
  1177  	// 0-7 bytes in common
  1178  cmp_small:
  1179  	LEAQ	(BP*8), CX	// bytes left -> bits left
  1180  	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
  1181  	JEQ	cmp_allsame	// NEGQ set ZF, so BP == 0: no bytes in common to compare
  1182  
  1183  	// load bytes of a into high bytes of SI
  1184  	CMPB	SI, $0xf8	// examines only the low byte of the address
  1185  	JA	cmp_si_high
  1186  	MOVQ	(SI), SI
  1187  	JMP	cmp_si_finish
  1188  cmp_si_high:
  1189  	MOVQ	-8(SI)(BP*1), SI	// load the 8 bytes ending at the last wanted byte
  1190  	SHRQ	CX, SI	// bring the wanted bytes down to the bottom
  1191  cmp_si_finish:
  1192  	SHLQ	CX, SI	// wanted bytes to the top, zeros below
  1193  
  1194  	// load bytes of b into high bytes of DI
  1195  	CMPB	DI, $0xf8
  1196  	JA	cmp_di_high
  1197  	MOVQ	(DI), DI
  1198  	JMP	cmp_di_finish
  1199  cmp_di_high:
  1200  	MOVQ	-8(DI)(BP*1), DI
  1201  	SHRQ	CX, DI
  1202  cmp_di_finish:
  1203  	SHLQ	CX, DI
  1204  
  1205  	BSWAPQ	SI	// reverse order of bytes
  1206  	BSWAPQ	DI
  1207  	XORQ	SI, DI	// find bit differences
  1208  	JEQ	cmp_allsame
  1209  	BSRQ	DI, CX	// index of highest bit difference
  1210  	SHRQ	CX, SI	// move a's bit to bottom
  1211  	ANDQ	$1, SI	// mask bit
  1212  	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
  1213  	RET
  1214  
  1215  cmp_allsame:
  1216  	XORQ	AX, AX	// both registers are cleared before the CMPQ because XORQ changes the flags
  1217  	XORQ	CX, CX
  1218  	CMPQ	BX, DX
  1219  	SETGT	AX	// 1 if alen > blen
  1220  	SETEQ	CX	// 1 if alen == blen
  1221  	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
  1222  	RET
  1223  
  1224  TEXT bytes·IndexByte(SB),NOSPLIT,$0	// []byte entry point; the search is done by indexbytebody
  1225  	MOVB c+24(FP), AL	// AL = byte sought
  1226  	MOVQ s_len+8(FP), BX	// BX = number of bytes to search
  1227  	MOVQ s+0(FP), SI	// SI = start of the data
  1228  	CALL runtime·indexbytebody(SB)
  1229  	MOVQ AX, ret+32(FP)	// store the value indexbytebody left in AX
  1230  	RET
  1231  
  1232  TEXT strings·IndexByte(SB),NOSPLIT,$0	// string entry point; the search is done by indexbytebody
  1233  	MOVB c+16(FP), AL	// AL = byte sought
  1234  	MOVQ s_len+8(FP), BX	// BX = number of bytes to search
  1235  	MOVQ s+0(FP), SI	// SI = start of the data
  1236  	CALL runtime·indexbytebody(SB)
  1237  	MOVQ AX, ret+24(FP)	// store the value indexbytebody left in AX
  1238  	RET
  1239  
  1240  // indexbytebody finds the first occurrence of a byte in a buffer.
  1241  // input:
  1242  //   SI: data, BX: data len, AL: byte sought (the rest of AX is ignored)
  1243  // output:
  1244  //   AX: index of the first matching byte, or -1 if there is none
  1245  // clobbers CX, DX, DI, R11, X0, X1 and the flags
  1246  TEXT runtime·indexbytebody(SB),NOSPLIT,$0
  1247  	MOVQ SI, DI	// DI is the scan cursor; SI keeps the start for the index computation
  1248  
  1249  	CMPQ BX, $16
  1250  	JLT indexbyte_small
  1251  
  1252  	// round up to first 16-byte boundary
  1253  	TESTQ $15, SI
  1254  	JZ aligned
  1255  	MOVQ SI, CX
  1256  	ANDQ $~15, CX
  1257  	ADDQ $16, CX
  1258  
  1259  	// search the beginning
  1260  	SUBQ SI, CX	// CX = number of bytes before the boundary (1..15)
  1261  	REPN; SCASB
  1262  	JZ success
  1263  
  1264  // DI is 16-byte aligned; get ready to search using SSE instructions
  1265  aligned:
  1266  	// round down to last 16-byte boundary
  1267  	MOVQ BX, R11
  1268  	ADDQ SI, R11
  1269  	ANDQ $~15, R11
  1270  
  1271  	// shuffle X0 around so that each byte contains c
  1272  	MOVD AX, X0
  1273  	PUNPCKLBW X0, X0
  1274  	PUNPCKLBW X0, X0
  1275  	PSHUFL $0, X0, X0
  1276  	JMP condition
  1277  
  1278  sse:
  1279  	// move the next 16-byte chunk of the buffer into X1
  1280  	MOVO (DI), X1
  1281  	// compare bytes in X0 to X1
  1282  	PCMPEQB X0, X1
  1283  	// take the top bit of each byte in X1 and put the result in DX
  1284  	PMOVMSKB X1, DX
  1285  	TESTL DX, DX
  1286  	JNZ ssesuccess
  1287  	ADDQ $16, DI
  1288  
  1289  condition:
  1290  	CMPQ DI, R11
  1291  	JLT sse
  1292  
  1293  	// search the end
  1294  	MOVQ SI, CX
  1295  	ADDQ BX, CX
  1296  	SUBQ R11, CX	// CX = bytes left after the last 16-byte boundary (0..15)
  1297  	// if CX == 0, the zero flag will be set and we'll end up
  1298  	// returning a false success
  1299  	JZ failure
  1300  	REPN; SCASB
  1301  	JZ success
  1302  
  1303  failure:
  1304  	MOVQ $-1, AX
  1305  	RET
  1306  
  1307  // handle for lengths < 16
  1308  indexbyte_small:
  1309  	MOVQ BX, CX
  1310  	REPN; SCASB	// with CX == 0 nothing is scanned and ZF is still clear from the CMPQ against 16
  1311  	JZ success
  1312  	MOVQ $-1, AX
  1313  	RET
  1314  
  1315  // we've found the chunk containing the byte
  1316  // now just figure out which specific byte it is
  1317  ssesuccess:
  1318  	// get the index of the least significant set bit
  1319  	BSFW DX, DX
  1320  	SUBQ SI, DI	// DI = offset of the matching chunk from the start
  1321  	ADDQ DI, DX
  1322  	MOVQ DX, AX
  1323  	RET
  1324  
  1325  success:
  1326  	SUBQ SI, DI
  1327  	SUBQ $1, DI	// SCASB left DI one past the match; a 64-bit subtract keeps indexes of 4GB and above intact (SUBL would clear the upper 32 bits)
  1328  	MOVQ DI, AX
  1329  	RET
  1330  
  1331  TEXT bytes·Equal(SB),NOSPLIT,$0-49	// slices are equal iff the lengths match and memeqbody finds the bytes identical
  1332  	XORQ	AX, AX	// result defaults to 0 (not equal); cleared before the CMPQ so the flags survive
  1333  	MOVQ	b_len+32(FP), CX
  1334  	MOVQ	a_len+8(FP), BX
  1335  	CMPQ	BX, CX
  1336  	JNE	eqret	// different lengths: store the 0 already in AX
  1337  	MOVQ	b+24(FP), DI	// DI = data pointer of b
  1338  	MOVQ	a+0(FP), SI	// SI = data pointer of a; BX already holds the byte count
  1339  	CALL	runtime·memeqbody(SB)
  1340  eqret:
  1341  	MOVB	AX, ret+48(FP)
  1342  	RET