github.com/filosottile/go@v0.0.0-20170906193555-dbed9972d994/src/runtime/asm_amd64p32.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
        	// Runtime bootstrap. Realigns the incoming stack, records g0's stack
        	// bounds, probes CPU features with CPUID, installs TLS, links g0 and m0,
        	// then calls check, args, osinit and schedinit, queues the function
        	// stored in runtime·mainPC via newproc, and enters mstart.
    11  	// copy arguments forward on an even stack
    12  	MOVL	argc+0(FP), AX
    13  	MOVL	argv+4(FP), BX
    14  	MOVL	SP, CX
    15  	SUBL	$128, CX		// plenty of scratch
    16  	ANDL	$~15, CX		// round down to a 16-byte boundary
    17  	MOVL	CX, SP
    18  
    19  	MOVL	AX, 16(SP)		// stash argc; reloaded before runtime·args
    20  	MOVL	BX, 24(SP)		// stash argv
    21  	
    22  	// create istack out of the given (operating system) stack.
    23  	MOVL	$runtime·g0(SB), DI
    24  	LEAL	(-64*1024+104)(SP), BX	// BX = SP - 64 KiB + 104: lower bound for g0's stack
    25  	MOVL	BX, g_stackguard0(DI)
    26  	MOVL	BX, g_stackguard1(DI)
    27  	MOVL	BX, (g_stack+stack_lo)(DI)
    28  	MOVL	SP, (g_stack+stack_hi)(DI)
    29  
    30  	// find out information about the processor we're on
    31  	MOVL	$0, AX
    32  	CPUID
    33  	MOVL	AX, SI			// SI = highest basic CPUID leaf; consulted at eax7 below
    34  	CMPL	AX, $0
    35  	JE	nocpuinfo
    36  	CMPL	BX, $0x756E6547  // "Genu"
    37  	JNE	notintel
    38  	CMPL	DX, $0x49656E69  // "ineI"
    39  	JNE	notintel
    40  	CMPL	CX, $0x6C65746E  // "ntel"
    41  	JNE	notintel
    42  	MOVB	$1, runtime·isIntel(SB)
    43  notintel:
    44  
    45  	// Load EAX=1 cpuid flags
    46  	MOVL	$1, AX
    47  	CPUID
    48  	MOVL	AX, runtime·processorVersionInfo(SB)
    49  
    50  	TESTL	$(1<<26), DX // SSE2
    51  	SETNE	runtime·support_sse2(SB)
    52  
    53  	TESTL	$(1<<9), CX // SSSE3
    54  	SETNE	runtime·support_ssse3(SB)
    55  
    56  	TESTL	$(1<<19), CX // SSE4.1
    57  	SETNE	runtime·support_sse41(SB)
    58  
    59  	TESTL	$(1<<20), CX // SSE4.2
    60  	SETNE	runtime·support_sse42(SB)
    61  
    62  	TESTL	$(1<<23), CX // POPCNT
    63  	SETNE	runtime·support_popcnt(SB)
    64  
    65  	TESTL	$(1<<25), CX // AES
    66  	SETNE	runtime·support_aes(SB)
    67  
    68  	TESTL	$(1<<27), CX // OSXSAVE
    69  	SETNE	runtime·support_osxsave(SB)
    70  
    71  	// If OS support for XMM and YMM is not present
    72  	// support_avx will be set back to false later.
    73  	TESTL	$(1<<28), CX // AVX
    74  	SETNE	runtime·support_avx(SB)
    75  
    76  eax7:
    77  	// Load EAX=7/ECX=0 cpuid flags
    78  	CMPL	SI, $7		// SI was saved from the leaf-0 query above
    79  	JLT	osavx		// leaf 7 not available: skip the extended feature bits
    80  	MOVL	$7, AX
    81  	MOVL	$0, CX
    82  	CPUID
    83  
    84  	TESTL	$(1<<3), BX // BMI1
    85  	SETNE	runtime·support_bmi1(SB)
    86  
    87  	// If OS support for XMM and YMM is not present
    88  	// support_avx2 will be set back to false later.
    89  	TESTL	$(1<<5), BX // AVX2
    90  	SETNE	runtime·support_avx2(SB)
    91  
    92  	TESTL	$(1<<8), BX // BMI2
    93  	SETNE	runtime·support_bmi2(SB)
    94  
    95  	TESTL	$(1<<9), BX // ERMS
    96  	SETNE	runtime·support_erms(SB)
    97  
    98  osavx:
    99  	// nacl does not support XGETBV to test
   100  	// for XMM and YMM OS support.
        	// With GOOS_nacl defined the check below is compiled out, so control
        	// always falls into noavx and both AVX flags are cleared.
   101  #ifndef GOOS_nacl
   102  	CMPB	runtime·support_osxsave(SB), $1
   103  	JNE	noavx
   104  	MOVL	$0, CX
   105  	// For XGETBV, OSXSAVE bit is required and sufficient
   106  	XGETBV
   107  	ANDL	$6, AX
   108  	CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
   109  	JE nocpuinfo
   110  #endif
   111  noavx:
   112  	MOVB $0, runtime·support_avx(SB)
   113  	MOVB $0, runtime·support_avx2(SB)
   114  
   115  nocpuinfo:
   116  
   117  needtls:
   118  	LEAL	runtime·m0+m_tls(SB), DI
   119  	CALL	runtime·settls(SB)
   120  
   121  	// store through it, to make sure it works
   122  	get_tls(BX)
   123  	MOVQ	$0x123, g(BX)
   124  	MOVQ	runtime·m0+m_tls(SB), AX
   125  	CMPQ	AX, $0x123
   126  	JEQ 2(PC)		// value read back correctly: skip the abort
   127  	MOVL	AX, 0	// abort
   128  ok:
   129  	// set the per-goroutine and per-mach "registers"
   130  	get_tls(BX)
   131  	LEAL	runtime·g0(SB), CX
   132  	MOVL	CX, g(BX)
   133  	LEAL	runtime·m0(SB), AX
   134  
   135  	// save m->g0 = g0
   136  	MOVL	CX, m_g0(AX)
   137  	// save m0 to g0->m
   138  	MOVL	AX, g_m(CX)
   139  
   140  	CLD				// convention is D is always left cleared
   141  	CALL	runtime·check(SB)
   142  
   143  	MOVL	16(SP), AX		// copy argc
   144  	MOVL	AX, 0(SP)
   145  	MOVL	24(SP), AX		// copy argv
   146  	MOVL	AX, 4(SP)
   147  	CALL	runtime·args(SB)
   148  	CALL	runtime·osinit(SB)
   149  	CALL	runtime·schedinit(SB)
   150  
   151  	// create a new goroutine to start program
   152  	MOVL	$runtime·mainPC(SB), AX	// entry
   153  	MOVL	$0, 0(SP)		// first argument slot: 0
   154  	MOVL	AX, 4(SP)		// second argument slot: address of mainPC
   155  	CALL	runtime·newproc(SB)
   156  
   157  	// start this M
   158  	CALL	runtime·mstart(SB)
   159  
   160  	MOVL	$0xf1, 0xf1  // crash
   161  	RET
   162  
   163  DATA	runtime·mainPC+0(SB)/4,$runtime·main(SB)	// 4-byte cell initialised with the address of runtime·main
   164  GLOBL	runtime·mainPC(SB),RODATA,$4			// read-only, 4 bytes; rt0_go hands its address to newproc
   165  
   166  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0	// Executes INT 3, then returns.
   167  	INT $3	// software interrupt 3
   168  	RET
   169  
   170  TEXT runtime·asminit(SB),NOSPLIT,$0-0	// Empty hook: returns immediately.
   171  	// No per-thread init.
   172  	RET
   173  
   174  /*
   175   *  go-routine
   176   */
   177  
   178  // void gosave(Gobuf*)
   179  // save state in Gobuf; setjmp
        // Stores the caller's SP and PC and the current g into buf, zeroes
        // buf.ret, and calls runtime·badctxt if buf.ctxt is already non-zero.
   180  TEXT runtime·gosave(SB), NOSPLIT, $0-4
   181  	MOVL	buf+0(FP), AX	// gobuf
   182  	LEAL	buf+0(FP), BX	// caller's SP
   183  	MOVL	BX, gobuf_sp(AX)
   184  	MOVL	0(SP), BX		// caller's PC
   185  	MOVL	BX, gobuf_pc(AX)
   186  	MOVQ	$0, gobuf_ret(AX)	// 8-byte store: ret = 0
   187  	// Assert ctxt is zero. See func save.
   188  	MOVL	gobuf_ctxt(AX), BX
   189  	TESTL	BX, BX
   190  	JZ	2(PC)			// ctxt == 0: skip the badctxt call
   191  	CALL	runtime·badctxt(SB)
   192  	get_tls(CX)
   193  	MOVL	g(CX), BX
   194  	MOVL	BX, gobuf_g(AX)	// buf.g = current g
   195  	RET
   196  
   197  // void gogo(Gobuf*)
   198  // restore state from Gobuf; longjmp
        // Installs buf.g as the current g, loads SP, DX (ctxt) and AX (ret) from
        // buf, clears those fields in buf, and jumps to buf.pc. Does not return.
   199  TEXT runtime·gogo(SB), NOSPLIT, $8-4
   200  	MOVL	buf+0(FP), BX		// gobuf
   201  
   202  	// If ctxt is not nil, invoke deletion barrier before overwriting.
   203  	MOVL	gobuf_ctxt(BX), DX
   204  	TESTL	DX, DX
   205  	JZ	nilctxt
   206  	LEAL	gobuf_ctxt(BX), AX
   207  	MOVL	AX, 0(SP)		// first argument slot: address of buf.ctxt
   208  	MOVL	$0, 4(SP)		// second argument slot: 0
   209  	CALL	runtime·writebarrierptr_prewrite(SB)
   210  	MOVL	buf+0(FP), BX		// reload gobuf after the call
   211  
   212  nilctxt:
   213  	MOVL	gobuf_g(BX), DX
   214  	MOVL	0(DX), CX		// make sure g != nil
   215  	get_tls(CX)
   216  	MOVL	DX, g(CX)
   217  	MOVL	gobuf_sp(BX), SP	// restore SP
   218  	MOVL	gobuf_ctxt(BX), DX
   219  	MOVQ	gobuf_ret(BX), AX
   220  	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
   221  	MOVQ	$0, gobuf_ret(BX)
   222  	MOVL	$0, gobuf_ctxt(BX)
   223  	MOVL	gobuf_pc(BX), BX
   224  	JMP	BX			// continue at the saved PC
   225  
   226  // func mcall(fn func(*g))
   227  // Switch to m->g0's stack, call fn(g).
   228  // Fn must never return. It should gogo(&g->sched)
   229  // to keep running g.
   230  TEXT runtime·mcall(SB), NOSPLIT, $0-4
   231  	MOVL	fn+0(FP), DI
   232  	
   233  	get_tls(CX)
   234  	MOVL	g(CX), AX	// save state in g->sched
   235  	MOVL	0(SP), BX	// caller's PC
   236  	MOVL	BX, (g_sched+gobuf_pc)(AX)
   237  	LEAL	fn+0(FP), BX	// caller's SP
   238  	MOVL	BX, (g_sched+gobuf_sp)(AX)
   239  	MOVL	AX, (g_sched+gobuf_g)(AX)
   240  
   241  	// switch to m->g0 & its stack, call fn
   242  	MOVL	g(CX), BX
   243  	MOVL	g_m(BX), BX
   244  	MOVL	m_g0(BX), SI
   245  	CMPL	SI, AX	// if g == m->g0 call badmcall
   246  	JNE	3(PC)	// g != g0: skip the two-instruction jump to badmcall
   247  	MOVL	$runtime·badmcall(SB), AX
   248  	JMP	AX
   249  	MOVL	SI, g(CX)	// g = m->g0
   250  	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   251  	PUSHQ	AX		// the old g is fn's argument
   252  	MOVL	DI, DX		// DX = fn value
   253  	MOVL	0(DI), DI	// DI = code pointer stored at offset 0 of fn
   254  	CALL	DI
   255  	POPQ	AX
   256  	MOVL	$runtime·badmcall2(SB), AX	// reached only if fn returned
   257  	JMP	AX
   258  	RET
   259  
   260  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   261  // of the G stack. We need to distinguish the routine that
   262  // lives at the bottom of the G stack from the one that lives
   263  // at the top of the system stack because the one at the top of
   264  // the system stack terminates the stack walk (see topofstack()).
   265  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   266  	RET	// body is never meant to do work; only the symbol's PC is used by systemstack
   267  
   268  // func systemstack(fn func())
        // Runs fn on the m's g0 stack. If already on g0 or gsignal, fn is called
        // directly; if on curg, the stack is switched to g0 and back around the call.
   269  TEXT runtime·systemstack(SB), NOSPLIT, $0-4
   270  	MOVL	fn+0(FP), DI	// DI = fn
   271  	get_tls(CX)
   272  	MOVL	g(CX), AX	// AX = g
   273  	MOVL	g_m(AX), BX	// BX = m
   274  
   275  	MOVL	m_gsignal(BX), DX	// DX = gsignal
   276  	CMPL	AX, DX
   277  	JEQ	noswitch
   278  
   279  	MOVL	m_g0(BX), DX	// DX = g0
   280  	CMPL	AX, DX
   281  	JEQ	noswitch
   282  
   283  	MOVL	m_curg(BX), R8
   284  	CMPL	AX, R8
   285  	JEQ	switch
   286  	
   287  	// Not g0, not curg. Must be gsignal, but that's not allowed.
   288  	// Hide call from linker nosplit analysis.
   289  	MOVL	$runtime·badsystemstack(SB), AX
   290  	CALL	AX
   291  
   292  switch:
   293  	// save our state in g->sched. Pretend to
   294  	// be systemstack_switch if the G stack is scanned.
   295  	MOVL	$runtime·systemstack_switch(SB), SI
   296  	MOVL	SI, (g_sched+gobuf_pc)(AX)
   297  	MOVL	SP, (g_sched+gobuf_sp)(AX)
   298  	MOVL	AX, (g_sched+gobuf_g)(AX)
   299  
   300  	// switch to g0
   301  	MOVL	DX, g(CX)	// DX still holds g0 from the check above
   302  	MOVL	(g_sched+gobuf_sp)(DX), SP
   303  
   304  	// call target function
   305  	MOVL	DI, DX		// DX = fn value
   306  	MOVL	0(DI), DI	// DI = code pointer stored at offset 0 of fn
   307  	CALL	DI
   308  
   309  	// switch back to g
   310  	get_tls(CX)
   311  	MOVL	g(CX), AX
   312  	MOVL	g_m(AX), BX
   313  	MOVL	m_curg(BX), AX
   314  	MOVL	AX, g(CX)
   315  	MOVL	(g_sched+gobuf_sp)(AX), SP
   316  	MOVL	$0, (g_sched+gobuf_sp)(AX)	// clear the saved SP now that it has been restored
   317  	RET
   318  
   319  noswitch:
   320  	// already on m stack, just call directly
   321  	MOVL	DI, DX
   322  	MOVL	0(DI), DI
   323  	CALL	DI
   324  	RET
   325  
   326  /*
   327   * support for morestack
   328   */
   329  
   330  // Called during function prolog when more stack is needed.
   331  //
   332  // The traceback routines see morestack on a g0 as being
   333  // the top of a stack (for example, morestack calling newstack
   334  // calling the scheduler calling newm calling gc), so we must
   335  // record an argument size. For that purpose, it has no arguments.
        //
        // Records f's caller in m->morebuf and f itself in g->sched, switches to
        // m->g0's stack and calls newstack with DX pushed as the ctxt argument.
   336  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   337  	get_tls(CX)
   338  	MOVL	g(CX), BX
   339  	MOVL	g_m(BX), BX	// BX = m
   340  
   341  	// Cannot grow scheduler stack (m->g0).
   342  	MOVL	m_g0(BX), SI
   343  	CMPL	g(CX), SI
   344  	JNE	3(PC)		// not g0: skip the call and the load below
   345  	CALL	runtime·badmorestackg0(SB)
   346  	MOVL	0, AX		// load from address 0 if the call returns
   347  
   348  	// Cannot grow signal stack (m->gsignal).
   349  	MOVL	m_gsignal(BX), SI
   350  	CMPL	g(CX), SI
   351  	JNE	3(PC)		// not gsignal: skip the call and the load below
   352  	CALL	runtime·badmorestackgsignal(SB)
   353  	MOVL	0, AX		// load from address 0 if the call returns
   354  
   355  	// Called from f.
   356  	// Set m->morebuf to f's caller.
   357  	MOVL	8(SP), AX	// f's caller's PC
   358  	MOVL	AX, (m_morebuf+gobuf_pc)(BX)
   359  	LEAL	16(SP), AX	// f's caller's SP
   360  	MOVL	AX, (m_morebuf+gobuf_sp)(BX)
   361  	get_tls(CX)
   362  	MOVL	g(CX), SI
   363  	MOVL	SI, (m_morebuf+gobuf_g)(BX)
   364  
   365  	// Set g->sched to context in f.
   366  	MOVL	0(SP), AX // f's PC
   367  	MOVL	AX, (g_sched+gobuf_pc)(SI)
   368  	MOVL	SI, (g_sched+gobuf_g)(SI)
   369  	LEAL	8(SP), AX // f's SP
   370  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   371  	// newstack will fill gobuf.ctxt.
   372  
   373  	// Call newstack on m->g0's stack.
   374  	MOVL	m_g0(BX), BX
   375  	MOVL	BX, g(CX)
   376  	MOVL	(g_sched+gobuf_sp)(BX), SP
   377  	PUSHQ	DX	// ctxt argument
   378  	CALL	runtime·newstack(SB)
   379  	MOVL	$0, 0x1003	// crash if newstack returns
   380  	POPQ	DX	// keep balance check happy
   381  	RET
   382  
   383  // morestack trampolines
   384  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   385  	MOVL	$0, DX		// no context: morestack pushes DX as newstack's ctxt argument
   386  	JMP	runtime·morestack(SB)
   387  
   388  // reflectcall: call a function with the given argument list.
   389  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   390  // We don't have variable-sized frames, so we use a small number
   391  // of constant-sized-frame functions to encode a few bits of size in the pc.
   392  // Caution: ugly multiline assembly macros in your future!
   393  
        // DISPATCH(NAME, MAXSIZE) jumps to NAME when CX <= MAXSIZE (unsigned);
        // otherwise it skips the two-instruction jump and falls through.
   394  #define DISPATCH(NAME,MAXSIZE)		\
   395  	CMPL	CX, $MAXSIZE;		\
   396  	JA	3(PC);			\
   397  	MOVL	$NAME(SB), AX;		\
   398  	JMP	AX
   399  // Note: can't just "JMP NAME(SB)" - bad inlining results.
   400  
   401  TEXT reflect·call(SB), NOSPLIT, $0-0	// Alias: tail-jumps to ·reflectcall.
   402  	JMP	·reflectcall(SB)
   403  
   404  TEXT ·reflectcall(SB), NOSPLIT, $0-20
        	// Selects the smallest call<N> stub whose frame size N is >= argsize and
        	// jumps to it; jumps to runtime·badreflectcall if argsize exceeds 1 GiB.
   405  	MOVLQZX argsize+12(FP), CX	// CX = argsize, zero-extended
   406  	DISPATCH(runtime·call16, 16)
   407  	DISPATCH(runtime·call32, 32)
   408  	DISPATCH(runtime·call64, 64)
   409  	DISPATCH(runtime·call128, 128)
   410  	DISPATCH(runtime·call256, 256)
   411  	DISPATCH(runtime·call512, 512)
   412  	DISPATCH(runtime·call1024, 1024)
   413  	DISPATCH(runtime·call2048, 2048)
   414  	DISPATCH(runtime·call4096, 4096)
   415  	DISPATCH(runtime·call8192, 8192)
   416  	DISPATCH(runtime·call16384, 16384)
   417  	DISPATCH(runtime·call32768, 32768)
   418  	DISPATCH(runtime·call65536, 65536)
   419  	DISPATCH(runtime·call131072, 131072)
   420  	DISPATCH(runtime·call262144, 262144)
   421  	DISPATCH(runtime·call524288, 524288)
   422  	DISPATCH(runtime·call1048576, 1048576)
   423  	DISPATCH(runtime·call2097152, 2097152)
   424  	DISPATCH(runtime·call4194304, 4194304)
   425  	DISPATCH(runtime·call8388608, 8388608)
   426  	DISPATCH(runtime·call16777216, 16777216)
   427  	DISPATCH(runtime·call33554432, 33554432)
   428  	DISPATCH(runtime·call67108864, 67108864)
   429  	DISPATCH(runtime·call134217728, 134217728)
   430  	DISPATCH(runtime·call268435456, 268435456)
   431  	DISPATCH(runtime·call536870912, 536870912)
   432  	DISPATCH(runtime·call1073741824, 1073741824)
   433  	MOVL	$runtime·badreflectcall(SB), AX	// no stub is large enough
   434  	JMP	AX
   435  
   436  #define CALLFN(NAME,MAXSIZE)			\
   437  TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
   438  	NO_LOCAL_POINTERS;			\
        	/* Defines one call stub with a MAXSIZE-byte frame. */	\
   439  	/* copy argsize bytes of arguments from argptr to the frame at SP */		\
   440  	MOVL	argptr+8(FP), SI;		\
   441  	MOVL	argsize+12(FP), CX;		\
   442  	MOVL	SP, DI;				\
   443  	REP;MOVSB;				\
   444  	/* call function: DX = f, code pointer is the word at offset 0 of f */			\
   445  	MOVL	f+4(FP), DX;			\
   446  	MOVL	(DX), AX;			\
   447  	CALL	AX;				\
   448  	/* copy return values back: skip the first retoffset bytes on both sides, */		\
        	/* then pass DX=argtype, DI=dst, SI=src, CX=byte count to callRet */		\
   449  	MOVL	argtype+0(FP), DX;		\
   450  	MOVL	argptr+8(FP), DI;		\
   451  	MOVL	argsize+12(FP), CX;		\
   452  	MOVL	retoffset+16(FP), BX;		\
   453  	MOVL	SP, SI;				\
   454  	ADDL	BX, DI;				\
   455  	ADDL	BX, SI;				\
   456  	SUBL	BX, CX;				\
   457  	CALL	callRet<>(SB);			\
   458  	RET
   459  
   460  // callRet copies return values back at the end of call*. This is a
   461  // separate function so it can allocate stack space for the arguments
   462  // to reflectcallmove. It does not follow the Go ABI; it expects its
   463  // arguments in registers.
        // Inputs: DX, DI, SI, CX, spilled in that order as reflectcallmove's four
        // 4-byte argument slots.
   464  TEXT callRet<>(SB), NOSPLIT, $16-0
   465  	MOVL	DX, 0(SP)	// slot 0: DX
   466  	MOVL	DI, 4(SP)	// slot 1: DI
   467  	MOVL	SI, 8(SP)	// slot 2: SI
   468  	MOVL	CX, 12(SP)	// slot 3: CX
   469  	CALL	runtime·reflectcallmove(SB)
   470  	RET
   471  
   472  CALLFN(·call16, 16)	// one stub per power-of-two frame size, 16 bytes up to 1 GiB
   473  CALLFN(·call32, 32)
   474  CALLFN(·call64, 64)
   475  CALLFN(·call128, 128)
   476  CALLFN(·call256, 256)
   477  CALLFN(·call512, 512)
   478  CALLFN(·call1024, 1024)
   479  CALLFN(·call2048, 2048)
   480  CALLFN(·call4096, 4096)
   481  CALLFN(·call8192, 8192)
   482  CALLFN(·call16384, 16384)
   483  CALLFN(·call32768, 32768)
   484  CALLFN(·call65536, 65536)
   485  CALLFN(·call131072, 131072)
   486  CALLFN(·call262144, 262144)
   487  CALLFN(·call524288, 524288)
   488  CALLFN(·call1048576, 1048576)
   489  CALLFN(·call2097152, 2097152)
   490  CALLFN(·call4194304, 4194304)
   491  CALLFN(·call8388608, 8388608)
   492  CALLFN(·call16777216, 16777216)
   493  CALLFN(·call33554432, 33554432)
   494  CALLFN(·call67108864, 67108864)
   495  CALLFN(·call134217728, 134217728)
   496  CALLFN(·call268435456, 268435456)
   497  CALLFN(·call536870912, 536870912)
   498  CALLFN(·call1073741824, 1073741824)
   499  
   500  TEXT runtime·procyield(SB),NOSPLIT,$0-0
        	// Executes PAUSE `cycles` times.
        	// NOTE(review): the frame is declared $0-0 although cycles+0(FP) is read;
        	// confirm the argument size against the Go declaration.
   501  	MOVL	cycles+0(FP), AX
   502  again:
   503  	PAUSE
   504  	SUBL	$1, AX
   505  	JNZ	again		// cycles == 0 wraps to 0xffffffff and keeps looping
   506  	RET
   507  
   508  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0	// Emits no instructions beyond RET.
   509  	// Stores are already ordered on x86, so this is just a
   510  	// compile barrier.
   511  	RET
   512  
   513  // void jmpdefer(fn, sp);
   514  // called from deferreturn.
   515  // 1. pop the caller
   516  // 2. subtract 5 bytes from the caller's return address
   517  // 3. jmp to the argument
   518  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
   519  	MOVL	fv+0(FP), DX	// DX = deferred function value
   520  	MOVL	argp+4(FP), BX
   521  	LEAL	-8(BX), SP	// caller sp after CALL
   522  	SUBL	$5, (SP)	// return to CALL again
   523  	MOVL	0(DX), BX	// BX = code pointer stored at offset 0 of fv
   524  	JMP	BX	// but first run the deferred function
   525  
   526  // func asmcgocall(fn, arg unsafe.Pointer) int32
   527  // Not implemented.
   528  TEXT runtime·asmcgocall(SB),NOSPLIT,$0-12
   529  	MOVL	0, AX	// load from absolute address 0; presumably meant to fault if ever called
   530  	RET
   531  
   532  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
   533  // Not implemented.
   534  TEXT runtime·cgocallback(SB),NOSPLIT,$0-16
   535  	MOVL	0, AX	// load from absolute address 0; presumably meant to fault if ever called
   536  	RET
   537  
   538  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
   539  // Not implemented.
   540  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$0-16
   541  	MOVL	0, AX	// load from absolute address 0; presumably meant to fault if ever called
   542  	RET
   543  
   544  // void setg(G*); set g. for use by needm.
   545  // Not implemented.
   546  TEXT runtime·setg(SB), NOSPLIT, $0-4
   547  	MOVL	0, AX	// load from absolute address 0; presumably meant to fault if ever called
   548  	RET
   549  
   550  // check that SP is in range [g->stack.lo, g->stack.hi)
        // As written, both comparisons are strict: the loads from address 0 are
        // skipped only when stack.lo < SP < stack.hi (unsigned).
   551  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   552  	get_tls(CX)
   553  	MOVL	g(CX), AX
   554  	CMPL	(g_stack+stack_hi)(AX), SP
   555  	JHI	2(PC)		// stack.hi > SP: skip the load from address 0
   556  	MOVL	0, AX
   557  	CMPL	SP, (g_stack+stack_lo)(AX)
   558  	JHI	2(PC)		// SP > stack.lo: skip the load from address 0
   559  	MOVL	0, AX
   560  	RET
   561  
   562  TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-8
        	// Zeroes n bytes starting at ptr: n/4 four-byte stores followed by
        	// n%4 single-byte stores.
   563  	MOVL	ptr+0(FP), DI
   564  	MOVL	n+4(FP), CX
   565  	MOVQ	CX, BX
   566  	ANDQ	$3, BX		// BX = n % 4 (tail bytes)
   567  	SHRQ	$2, CX		// CX = n / 4 (dword count)
   568  	MOVQ	$0, AX		// value to store
   569  	CLD			// ascending addresses for the string stores
   570  	REP
   571  	STOSL
   572  	MOVQ	BX, CX
   573  	REP
   574  	STOSB
   575  	// Note: we zero only 4 bytes at a time so that the tail is at most
   576  	// 3 bytes. That guarantees that we aren't zeroing pointers with STOSB.
   577  	// See issue 13160.
   578  	RET
   579  
   580  TEXT runtime·getcallerpc(SB),NOSPLIT,$8-12	// Returns the word stored 8 bytes below the address passed in argp.
   581  	MOVL	argp+0(FP),AX		// addr of first arg
   582  	MOVL	-8(AX),AX		// get calling pc
   583  	MOVL	AX, ret+8(FP)
   584  	RET
   585  
   586  // int64 runtime·cputicks(void)
        // NOTE(review): the frame is declared $0-0 although an 8-byte result is
        // written to ret+0(FP); confirm against the Go declaration.
   587  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   588  	RDTSC			// timestamp counter: high half in DX, low half in AX
   589  	SHLQ	$32, DX
   590  	ADDQ	DX, AX		// AX = DX<<32 + AX
   591  	MOVQ	AX, ret+0(FP)
   592  	RET
   593  
   594  // Hash functions using AES hardware instructions.
   595  // For now, our one amd64p32 system (NaCl) does not
   596  // support using AES instructions, so we have not bothered to
   597  // write the implementations. We can copy and adjust the ones
   598  // in asm_amd64.s when the time comes.
   599  
   600  TEXT runtime·aeshash(SB),NOSPLIT,$0-20
   601  	MOVL	AX, ret+16(FP)	// placeholder: stores whatever AX holds on entry; no hashing is done
   602  	RET
   603  
   604  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
   605  	MOVL	AX, ret+8(FP)	// placeholder: stores whatever AX holds on entry; no hashing is done
   606  	RET
   607  
   608  TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
   609  	MOVL	AX, ret+8(FP)	// placeholder: stores whatever AX holds on entry; no hashing is done
   610  	RET
   611  
   612  TEXT runtime·aeshash64(SB),NOSPLIT,$0-12
   613  	MOVL	AX, ret+8(FP)	// placeholder: stores whatever AX holds on entry; no hashing is done
   614  	RET
   615  
   616  // memequal reports whether the size bytes at a and b are identical.
   617  TEXT runtime·memequal(SB),NOSPLIT,$0-17
   618  	MOVL	b+4(FP), DI		// DI = b
   619  	MOVL	a+0(FP), SI		// SI = a
   620  	CMPL	DI, SI
   621  	JEQ	same_ptr		// identical pointers: equal without looking at memory
   622  	MOVL	size+8(FP), BX		// BX = byte count for memeqbody
   623  	CALL	runtime·memeqbody(SB)
   624  	MOVB	AX, ret+16(FP)		// memeqbody leaves the 0/1 answer in AX
   625  	RET
   626  same_ptr:
   627  	MOVB	$1, ret+16(FP)
   628  	RET
   629  
   630  // memequal_varlen is memequal with the length taken from the closure in DX.
   631  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
   632  	MOVL	b+4(FP), DI		// DI = b
   633  	MOVL	a+0(FP), SI		// SI = a
   634  	CMPL	DI, SI
   635  	JEQ	same_ptr		// identical pointers: equal without looking at memory
   636  	MOVL	4(DX), BX		// compiler stores size at offset 4 in the closure
   637  	CALL	runtime·memeqbody(SB)
   638  	MOVB	AX, ret+8(FP)		// memeqbody leaves the 0/1 answer in AX
   639  	RET
   640  same_ptr:
   641  	MOVB	$1, ret+8(FP)
   642  	RET
   643  
   644  // a in SI
   645  // b in DI
   646  // count in BX
        // Result: AX = 1 if the count bytes at a and b are equal, else 0.
        // Register-based calling convention; SI, DI, BX, CX, DX and X0-X7 are overwritten.
   647  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
   648  	XORQ	AX, AX		// default result: not equal
   649  
   650  	CMPQ	BX, $8
   651  	JB	small
   652  	
   653  	// 64 bytes at a time using xmm registers
   654  hugeloop:
   655  	CMPQ	BX, $64
   656  	JB	bigloop
   657  	MOVOU	(SI), X0
   658  	MOVOU	(DI), X1
   659  	MOVOU	16(SI), X2
   660  	MOVOU	16(DI), X3
   661  	MOVOU	32(SI), X4
   662  	MOVOU	32(DI), X5
   663  	MOVOU	48(SI), X6
   664  	MOVOU	48(DI), X7
   665  	PCMPEQB	X1, X0		// per-byte equality masks for each 16-byte pair
   666  	PCMPEQB	X3, X2
   667  	PCMPEQB	X5, X4
   668  	PCMPEQB	X7, X6
   669  	PAND	X2, X0		// fold the four masks into X0
   670  	PAND	X6, X4
   671  	PAND	X4, X0
   672  	PMOVMSKB X0, DX		// DX = one bit per byte; 0xffff means all 64 bytes matched
   673  	ADDQ	$64, SI
   674  	ADDQ	$64, DI
   675  	SUBQ	$64, BX
   676  	CMPL	DX, $0xffff
   677  	JEQ	hugeloop
   678  	RET			// mismatch: AX is still 0
   679  
   680  	// 8 bytes at a time using 64-bit register
   681  bigloop:
   682  	CMPQ	BX, $8
   683  	JBE	leftover
   684  	MOVQ	(SI), CX
   685  	MOVQ	(DI), DX
   686  	ADDQ	$8, SI
   687  	ADDQ	$8, DI
   688  	SUBQ	$8, BX
   689  	CMPQ	CX, DX
   690  	JEQ	bigloop
   691  	RET			// mismatch: AX is still 0
   692  
   693  	// remaining 0-8 bytes
        	// Compares the 8 bytes that end at the end of each buffer; they may
        	// overlap bytes already compared above.
   694  leftover:
   695  	ADDQ	BX, SI
   696  	ADDQ	BX, DI
   697  	MOVQ	-8(SI), CX
   698  	MOVQ	-8(DI), DX
   699  	CMPQ	CX, DX
   700  	SETEQ	AX
   701  	RET
   702  
        	// count < 8
   703  small:
   704  	CMPQ	BX, $0
   705  	JEQ	equal		// zero length: ZF is set, so SETEQ at equal yields 1
   706  
   707  	LEAQ	0(BX*8), CX
   708  	NEGQ	CX		// shift count: -(8*count), i.e. 64 - 8*count mod 64
   709  
   710  	CMPB	SI, $0xf8
   711  	JA	si_high
   712  
   713  	// load at SI won't cross a page boundary.
   714  	MOVQ	(SI), SI
   715  	JMP	si_finish
   716  si_high:
   717  	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
   718  	MOVQ	BX, DX
   719  	ADDQ	SI, DX
   720  	MOVQ	-8(DX), SI
   721  	SHRQ	CX, SI
   722  si_finish:
   723  
   724  	// same for DI.
   725  	CMPB	DI, $0xf8
   726  	JA	di_high
   727  	MOVQ	(DI), DI
   728  	JMP	di_finish
   729  di_high:
   730  	MOVQ	BX, DX
   731  	ADDQ	DI, DX
   732  	MOVQ	-8(DX), DI
   733  	SHRQ	CX, DI
   734  di_finish:
   735  
   736  	SUBQ	SI, DI
   737  	SHLQ	CX, DI		// shift out the bytes beyond count; ZF set iff the wanted bytes match
   738  equal:
   739  	SETEQ	AX
   740  	RET
   741  
   742  TEXT runtime·cmpstring(SB),NOSPLIT,$0-20	// Three-way string compare: unpacks both strings and defers to cmpbody.
   743  	MOVL	s2_len+12(FP), DX	// DX = blen
   744  	MOVL	s2_base+8(FP), DI	// DI = b
   745  	MOVL	s1_len+4(FP), BX	// BX = alen
   746  	MOVL	s1_base+0(FP), SI	// SI = a
   747  	CALL	runtime·cmpbody(SB)
   748  	MOVL	AX, ret+16(FP)		// cmpbody's 1/0/-1 result
   749  	RET
   750  
   751  TEXT bytes·Compare(SB),NOSPLIT,$0-28	// Three-way slice compare: unpacks both slices and defers to cmpbody.
   752  	MOVL	s2+16(FP), DX		// DX = blen
   753  	MOVL	s2+12(FP), DI		// DI = b
   754  	MOVL	s1+4(FP), BX		// BX = alen
   755  	MOVL	s1+0(FP), SI		// SI = a
   756  	CALL	runtime·cmpbody(SB)
   757  	MOVL	AX, res+24(FP)		// cmpbody's 1/0/-1 result
   758  	RET
   759  
   760  // input:
   761  //   SI = a
   762  //   DI = b
   763  //   BX = alen
   764  //   DX = blen
   765  // output:
   766  //   AX = 1/0/-1
        // Compares the first min(alen, blen) bytes; if they all match, the result
        // is decided by comparing the lengths.
   767  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
   768  	CMPQ	SI, DI
   769  	JEQ	allsame		// same pointer: only the lengths can differ
   770  	CMPQ	BX, DX
   771  	MOVQ	DX, R8
   772  	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
   773  	CMPQ	R8, $8
   774  	JB	small
   775  
   776  loop:
   777  	CMPQ	R8, $16
   778  	JBE	_0through16
   779  	MOVOU	(SI), X0
   780  	MOVOU	(DI), X1
   781  	PCMPEQB X0, X1
   782  	PMOVMSKB X1, AX
   783  	XORQ	$0xffff, AX	// convert EQ to NE
   784  	JNE	diff16	// branch if at least one byte is not equal
   785  	ADDQ	$16, SI
   786  	ADDQ	$16, DI
   787  	SUBQ	$16, R8
   788  	JMP	loop
   789  	
   790  	// AX = bit mask of differences
   791  diff16:
   792  	BSFQ	AX, BX	// index of first byte that differs
   793  	XORQ	AX, AX
   794  	ADDQ	BX, SI
   795  	MOVB	(SI), CX
   796  	ADDQ	BX, DI
   797  	CMPB	CX, (DI)
   798  	SETHI	AX	// AX = 1 if a's byte is larger (unsigned), else 0
   799  	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
   800  	RET
   801  
   802  	// 0 through 16 bytes left, alen>=8, blen>=8
   803  _0through16:
   804  	CMPQ	R8, $8
   805  	JBE	_0through8
   806  	MOVQ	(SI), AX
   807  	MOVQ	(DI), CX
   808  	CMPQ	AX, CX
   809  	JNE	diff8
   810  _0through8:
   811  	ADDQ	R8, SI
   812  	ADDQ	R8, DI
   813  	MOVQ	-8(SI), AX	// last 8 bytes of the region still to compare
   814  	MOVQ	-8(DI), CX
   815  	CMPQ	AX, CX
   816  	JEQ	allsame
   817  
   818  	// AX and CX contain parts of a and b that differ.
   819  diff8:
   820  	BSWAPQ	AX	// reverse order of bytes
   821  	BSWAPQ	CX
   822  	XORQ	AX, CX
   823  	BSRQ	CX, CX	// index of highest bit difference
   824  	SHRQ	CX, AX	// move a's bit to bottom
   825  	ANDQ	$1, AX	// mask bit
   826  	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
   827  	RET
   828  
   829  	// 0-7 bytes in common
   830  small:
   831  	LEAQ	(R8*8), CX	// bytes left -> bits left
   832  	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
   833  	JEQ	allsame		// nothing to compare: decide on lengths
   834  
   835  	// load bytes of a into high bytes of SI
   836  	CMPB	SI, $0xf8
   837  	JA	si_high
   838  	MOVQ	(SI), SI
   839  	JMP	si_finish
   840  si_high:
   841  	ADDQ	R8, SI
   842  	MOVQ	-8(SI), SI
   843  	SHRQ	CX, SI
   844  si_finish:
   845  	SHLQ	CX, SI
   846  
   847  	// load bytes of b into high bytes of DI
   848  	CMPB	DI, $0xf8
   849  	JA	di_high
   850  	MOVQ	(DI), DI
   851  	JMP	di_finish
   852  di_high:
   853  	ADDQ	R8, DI
   854  	MOVQ	-8(DI), DI
   855  	SHRQ	CX, DI
   856  di_finish:
   857  	SHLQ	CX, DI
   858  
   859  	BSWAPQ	SI	// reverse order of bytes
   860  	BSWAPQ	DI
   861  	XORQ	SI, DI	// find bit differences
   862  	JEQ	allsame
   863  	BSRQ	DI, CX	// index of highest bit difference
   864  	SHRQ	CX, SI	// move a's bit to bottom
   865  	ANDQ	$1, SI	// mask bit
   866  	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
   867  	RET
   868  
        	// compared bytes are identical: order by length
   869  allsame:
   870  	XORQ	AX, AX
   871  	XORQ	CX, CX
   872  	CMPQ	BX, DX
   873  	SETGT	AX	// 1 if alen > blen
   874  	SETEQ	CX	// 1 if alen == blen
   875  	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
   876  	RET
   877  
   878  TEXT bytes·IndexByte(SB),NOSPLIT,$0-20	// Slice front end for indexbytebody.
   879  	MOVB c+12(FP), AL	// AL = byte sought
   880  	MOVL s_len+4(FP), BX	// BX = data len
   881  	MOVL s+0(FP), SI	// SI = data
   882  	CALL runtime·indexbytebody(SB)
   883  	MOVL AX, ret+16(FP)	// index, or -1 when absent
   884  	RET
   885  
   886  TEXT strings·IndexByte(SB),NOSPLIT,$0-20	// String front end for indexbytebody.
   887  	MOVB c+8(FP), AL	// AL = byte sought
   888  	MOVL s_len+4(FP), BX	// BX = data len
   889  	MOVL s+0(FP), SI	// SI = data
   890  	CALL runtime·indexbytebody(SB)
   891  	MOVL AX, ret+16(FP)	// index, or -1 when absent
   892  	RET
   893  
   894  // input:
   895  //   SI: data
   896  //   BX: data len
   897  //   AL: byte sought
   898  // output:
   899  //   AX: index of the first matching byte, or -1 if there is none
   900  TEXT runtime·indexbytebody(SB),NOSPLIT,$0
   901  	MOVL SI, DI	// DI = scan cursor; SI keeps the start for the index computation
   902  
   903  	CMPL BX, $16
   904  	JLT small
   905  
   906  	// round up to first 16-byte boundary
   907  	TESTL $15, SI
   908  	JZ aligned
   909  	MOVL SI, CX
   910  	ANDL $~15, CX
   911  	ADDL $16, CX
   912  
   913  	// search the beginning
   914  	SUBL SI, CX	// CX = bytes up to the boundary
   915  	REPN; SCASB
   916  	JZ success
   917  
   918  // DI is 16-byte aligned; get ready to search using SSE instructions
   919  aligned:
   920  	// round down to last 16-byte boundary
   921  	MOVL BX, R11
   922  	ADDL SI, R11
   923  	ANDL $~15, R11	// R11 = (data + len) rounded down to 16
   924  
   925  	// shuffle X0 around so that each byte contains c
   926  	MOVD AX, X0
   927  	PUNPCKLBW X0, X0
   928  	PUNPCKLBW X0, X0
   929  	PSHUFL $0, X0, X0
   930  	JMP condition
   931  
   932  sse:
   933  	// move the next 16-byte chunk of the buffer into X1
   934  	MOVO (DI), X1
   935  	// compare bytes in X0 to X1
   936  	PCMPEQB X0, X1
   937  	// take the top bit of each byte in X1 and put the result in DX
   938  	PMOVMSKB X1, DX
   939  	TESTL DX, DX
   940  	JNZ ssesuccess
   941  	ADDL $16, DI
   942  
   943  condition:
   944  	CMPL DI, R11
   945  	JLT sse
   946  
   947  	// search the end
   948  	MOVL SI, CX
   949  	ADDL BX, CX
   950  	SUBL R11, CX	// CX = bytes left after the last aligned chunk
   951  	// if CX == 0, the zero flag will be set and we'll end up
   952  	// returning a false success
   953  	JZ failure
   954  	REPN; SCASB
   955  	JZ success
   956  
   957  failure:
   958  	MOVL $-1, AX
   959  	RET
   960  
   961  // handle for lengths < 16
   962  small:
   963  	MOVL BX, CX
   964  	REPN; SCASB
   965  	JZ success
   966  	MOVL $-1, AX
   967  	RET
   968  
   969  // we've found the chunk containing the byte
   970  // now just figure out which specific byte it is
   971  ssesuccess:
   972  	// get the index of the least significant set bit
   973  	BSFW DX, DX
   974  	SUBL SI, DI	// DI = offset of the chunk from the start
   975  	ADDL DI, DX
   976  	MOVL DX, AX
   977  	RET
   978  
        // SCASB has stepped DI one byte past the match.
   979  success:
   980  	SUBL SI, DI
   981  	SUBL $1, DI
   982  	MOVL DI, AX
   983  	RET
   984  
   985  TEXT bytes·Equal(SB),NOSPLIT,$0-25	// Slices are equal iff lengths match and memeqbody reports equal contents.
   986  	XORL	AX, AX			// result defaults to false
   987  	MOVL	b_len+16(FP), CX
   988  	MOVL	a_len+4(FP), BX		// BX doubles as memeqbody's count
   989  	CMPL	CX, BX
   990  	JNE	done			// different lengths: not equal
   991  	MOVL	b+12(FP), DI
   992  	MOVL	a+0(FP), SI
   993  	CALL	runtime·memeqbody(SB)
   994  done:
   995  	MOVB	AX, ret+24(FP)
   996  	RET
   997  
   998  TEXT runtime·return0(SB), NOSPLIT, $0	// Sets AX to 0 and returns.
   999  	MOVL	$0, AX
  1000  	RET
  1001  
  1002  // The top-most function running on a goroutine
  1003  // returns to goexit+PCQuantum.
  1004  TEXT runtime·goexit(SB),NOSPLIT,$0-0
  1005  	BYTE	$0x90	// NOP; one byte of padding ahead of the CALL
  1006  	CALL	runtime·goexit1(SB)	// does not return
  1007  	// traceback from goexit1 must hit code range of goexit
  1008  	BYTE	$0x90	// NOP; keeps the CALL's return address inside this function
  1009  
  1010  TEXT ·checkASM(SB),NOSPLIT,$0-1	// Unconditionally stores 1 in the one-byte result.
  1011  	MOVB	$1, ret+0(FP)
  1012  	RET