github.com/s1s1ty/go@v0.0.0-20180207192209-104445e3140f/src/runtime/asm_amd64p32.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "go_tls.h"
     7  #include "funcdata.h"
     8  #include "textflag.h"
     9  
    10  TEXT runtime·rt0_go(SB),NOSPLIT,$0
    11  	// copy arguments forward on an even stack
    12  	MOVL	argc+0(FP), AX
    13  	MOVL	argv+4(FP), BX
    14  	MOVL	SP, CX
    15  	SUBL	$128, CX		// plenty of scratch
    16  	ANDL	$~15, CX
    17  	MOVL	CX, SP
    18  
    19  	MOVL	AX, 16(SP)
    20  	MOVL	BX, 24(SP)
    21  	
    22  	// create istack out of the given (operating system) stack.
    23  	MOVL	$runtime·g0(SB), DI
    24  	LEAL	(-64*1024+104)(SP), BX
    25  	MOVL	BX, g_stackguard0(DI)
    26  	MOVL	BX, g_stackguard1(DI)
    27  	MOVL	BX, (g_stack+stack_lo)(DI)
    28  	MOVL	SP, (g_stack+stack_hi)(DI)
    29  
    30  	// find out information about the processor we're on
    31  	MOVL	$0, AX
    32  	CPUID
    33  	CMPL	AX, $0
    34  	JE	nocpuinfo
    35  
    36  	CMPL	BX, $0x756E6547  // "Genu"
    37  	JNE	notintel
    38  	CMPL	DX, $0x49656E69  // "ineI"
    39  	JNE	notintel
    40  	CMPL	CX, $0x6C65746E  // "ntel"
    41  	JNE	notintel
    42  	MOVB	$1, runtime·isIntel(SB)
    43  notintel:
    44  
    45  	// Load EAX=1 cpuid flags
    46  	MOVL	$1, AX
    47  	CPUID
    48  	MOVL	AX, runtime·processorVersionInfo(SB)
    49  
    50  	TESTL	$(1<<26), DX // SSE2
    51  	SETNE	runtime·support_sse2(SB)
    52  
    53  	TESTL	$(1<<9), CX // SSSE3
    54  	SETNE	runtime·support_ssse3(SB)
    55  
    56  	TESTL	$(1<<19), CX // SSE4.1
    57  	SETNE	runtime·support_sse41(SB)
    58  
    59  	TESTL	$(1<<20), CX // SSE4.2
    60  	SETNE	runtime·support_sse42(SB)
    61  
    62  	TESTL	$(1<<23), CX // POPCNT
    63  	SETNE	runtime·support_popcnt(SB)
    64  
    65  	TESTL	$(1<<25), CX // AES
    66  	SETNE	runtime·support_aes(SB)
    67  
    68  	TESTL	$(1<<27), CX // OSXSAVE
    69  	SETNE	runtime·support_osxsave(SB)
    70  
    71  	// If OS support for XMM and YMM is not present
    72  	// support_avx will be set back to false later.
    73  	TESTL	$(1<<28), CX // AVX
    74  	SETNE	runtime·support_avx(SB)
    75  
    76  eax7:
    77  	// Load EAX=7/ECX=0 cpuid flags
    78  	CMPL	SI, $7
    79  	JLT	osavx
    80  	MOVL	$7, AX
    81  	MOVL	$0, CX
    82  	CPUID
    83  
    84  	TESTL	$(1<<3), BX // BMI1
    85  	SETNE	runtime·support_bmi1(SB)
    86  
    87  	// If OS support for XMM and YMM is not present
    88  	// support_avx2 will be set back to false later.
    89  	TESTL	$(1<<5), BX
    90  	SETNE	runtime·support_avx2(SB)
    91  
    92  	TESTL	$(1<<8), BX // BMI2
    93  	SETNE	runtime·support_bmi2(SB)
    94  
    95  	TESTL	$(1<<9), BX // ERMS
    96  	SETNE	runtime·support_erms(SB)
    97  
    98  osavx:
    99  	// nacl does not support XGETBV to test
   100  	// for XMM and YMM OS support.
   101  #ifndef GOOS_nacl
   102  	CMPB	runtime·support_osxsave(SB), $1
   103  	JNE	noavx
   104  	MOVL	$0, CX
   105  	// For XGETBV, OSXSAVE bit is required and sufficient
   106  	XGETBV
   107  	ANDL	$6, AX
   108  	CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
   109  	JE nocpuinfo
   110  #endif
   111  noavx:
   112  	MOVB $0, runtime·support_avx(SB)
   113  	MOVB $0, runtime·support_avx2(SB)
   114  
   115  nocpuinfo:
   116  
   117  needtls:
   118  	LEAL	runtime·m0+m_tls(SB), DI
   119  	CALL	runtime·settls(SB)
   120  
   121  	// store through it, to make sure it works
   122  	get_tls(BX)
   123  	MOVQ	$0x123, g(BX)
   124  	MOVQ	runtime·m0+m_tls(SB), AX
   125  	CMPQ	AX, $0x123
   126  	JEQ 2(PC)
   127  	MOVL	AX, 0	// abort
   128  ok:
   129  	// set the per-goroutine and per-mach "registers"
   130  	get_tls(BX)
   131  	LEAL	runtime·g0(SB), CX
   132  	MOVL	CX, g(BX)
   133  	LEAL	runtime·m0(SB), AX
   134  
   135  	// save m->g0 = g0
   136  	MOVL	CX, m_g0(AX)
   137  	// save m0 to g0->m
   138  	MOVL	AX, g_m(CX)
   139  
   140  	CLD				// convention is D is always left cleared
   141  	CALL	runtime·check(SB)
   142  
   143  	MOVL	16(SP), AX		// copy argc
   144  	MOVL	AX, 0(SP)
   145  	MOVL	24(SP), AX		// copy argv
   146  	MOVL	AX, 4(SP)
   147  	CALL	runtime·args(SB)
   148  	CALL	runtime·osinit(SB)
   149  	CALL	runtime·schedinit(SB)
   150  
   151  	// create a new goroutine to start program
   152  	MOVL	$runtime·mainPC(SB), AX	// entry
   153  	MOVL	$0, 0(SP)
   154  	MOVL	AX, 4(SP)
   155  	CALL	runtime·newproc(SB)
   156  
   157  	// start this M
   158  	CALL	runtime·mstart(SB)
   159  
   160  	MOVL	$0xf1, 0xf1  // crash
   161  	RET
   162  
        // mainPC is a 4-byte read-only word holding the address of runtime·main.
        // rt0_go passes the address of this word to newproc as the entry to run.
   163  DATA	runtime·mainPC+0(SB)/4,$runtime·main(SB)
   164  GLOBL	runtime·mainPC(SB),RODATA,$4
   165  
        // breakpoint executes INT $3 and returns if execution is resumed afterwards.
   166  TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
   167  	INT $3
   168  	RET
   169  
        // asminit is the per-thread assembly initialization hook; here it is empty.
   170  TEXT runtime·asminit(SB),NOSPLIT,$0-0
   171  	// No per-thread init.
   172  	RET
   173  
   174  /*
   175   *  go-routine
   176   */
   177  
   178  // void gosave(Gobuf*)
   179  // save state in Gobuf; setjmp
        // Stores the caller's SP (the address of the argument slot) and the caller's
        // PC (the return address at 0(SP)) into the Gobuf, zeroes gobuf.ret, and
        // records the current g. gobuf.ctxt is expected to be zero already; if it is
        // not, runtime·badctxt is called.
   180  TEXT runtime·gosave(SB), NOSPLIT, $0-4
   181  	MOVL	buf+0(FP), AX	// gobuf
   182  	LEAL	buf+0(FP), BX	// caller's SP
   183  	MOVL	BX, gobuf_sp(AX)
   184  	MOVL	0(SP), BX		// caller's PC
   185  	MOVL	BX, gobuf_pc(AX)
   186  	MOVQ	$0, gobuf_ret(AX)
   187  	// Assert ctxt is zero. See func save.
   188  	MOVL	gobuf_ctxt(AX), BX
   189  	TESTL	BX, BX
   190  	JZ	2(PC)	// ctxt == 0: skip the badctxt call
   191  	CALL	runtime·badctxt(SB)
   192  	get_tls(CX)
   193  	MOVL	g(CX), BX
   194  	MOVL	BX, gobuf_g(AX)	// gobuf.g = current g
   195  	RET
   196  
   197  // void gogo(Gobuf*)
   198  // restore state from Gobuf; longjmp
        // Installs gobuf.g as the current g, switches SP to gobuf.sp, loads
        // gobuf.ctxt into DX and gobuf.ret into AX, zeroes the sp/ret/ctxt fields
        // of the Gobuf, and jumps to gobuf.pc. Control does not come back here.
   199  TEXT runtime·gogo(SB), NOSPLIT, $8-4
   200  	MOVL	buf+0(FP), BX		// gobuf
   201  	MOVL	gobuf_g(BX), DX
   202  	MOVL	0(DX), CX		// make sure g != nil
   203  	get_tls(CX)
   204  	MOVL	DX, g(CX)		// current g = gobuf.g
   205  	MOVL	gobuf_sp(BX), SP	// restore SP
   206  	MOVL	gobuf_ctxt(BX), DX	// DX = saved context
   207  	MOVQ	gobuf_ret(BX), AX	// AX = saved return value
   208  	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
   209  	MOVQ	$0, gobuf_ret(BX)
   210  	MOVL	$0, gobuf_ctxt(BX)
   211  	MOVL	gobuf_pc(BX), BX	// load the target PC last; BX no longer points at the gobuf
   212  	JMP	BX
   213  
   214  // func mcall(fn func(*g))
   215  // Switch to m->g0's stack, call fn(g).
   216  // Fn must never return. It should gogo(&g->sched)
   217  // to keep running g.
        // The caller's PC and SP and the current g are saved in g->sched before the
        // switch. If the current g already is m->g0, control jumps to badmcall; if
        // fn returns, control jumps to badmcall2.
   218  TEXT runtime·mcall(SB), NOSPLIT, $0-4
   219  	MOVL	fn+0(FP), DI
   220  	
   221  	get_tls(CX)
   222  	MOVL	g(CX), AX	// save state in g->sched
   223  	MOVL	0(SP), BX	// caller's PC
   224  	MOVL	BX, (g_sched+gobuf_pc)(AX)
   225  	LEAL	fn+0(FP), BX	// caller's SP
   226  	MOVL	BX, (g_sched+gobuf_sp)(AX)
   227  	MOVL	AX, (g_sched+gobuf_g)(AX)
   228  
   229  	// switch to m->g0 & its stack, call fn
   230  	MOVL	g(CX), BX
   231  	MOVL	g_m(BX), BX
   232  	MOVL	m_g0(BX), SI
   233  	CMPL	SI, AX	// if g == m->g0 call badmcall
   234  	JNE	3(PC)	// g != m->g0: skip the two-instruction badmcall jump
   235  	MOVL	$runtime·badmcall(SB), AX
   236  	JMP	AX
   237  	MOVL	SI, g(CX)	// g = m->g0
   238  	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
   239  	PUSHQ	AX	// argument for fn: the g that called mcall
   240  	MOVL	DI, DX	// DX = fn (closure pointer)
   241  	MOVL	0(DI), DI	// DI = code pointer stored at the start of fn
   242  	CALL	DI
   243  	POPQ	AX
   244  	MOVL	$runtime·badmcall2(SB), AX	// reached only if fn returned
   245  	JMP	AX
   246  	RET
   247  
   248  // systemstack_switch is a dummy routine that systemstack leaves at the bottom
   249  // of the G stack. We need to distinguish the routine that
   250  // lives at the bottom of the G stack from the one that lives
   251  // at the top of the system stack because the one at the top of
   252  // the system stack terminates the stack walk (see topofstack()).
        // Its body is a single RET; systemstack stores this function's address as
        // the saved PC in g->sched before switching stacks.
   253  TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
   254  	RET
   255  
   256  // func systemstack(fn func())
        // Runs fn on the g0 stack.
        //   - If the current g is gsignal or g0, fn is tail-called directly (noswitch).
        //   - If the current g is m->curg, its state is saved in g->sched, the g0
        //     stack is installed, fn is called, and then m->curg and its saved SP
        //     are restored (switch).
        //   - Any other g is reported through badsystemstack.
   257  TEXT runtime·systemstack(SB), NOSPLIT, $0-4
   258  	MOVL	fn+0(FP), DI	// DI = fn
   259  	get_tls(CX)
   260  	MOVL	g(CX), AX	// AX = g
   261  	MOVL	g_m(AX), BX	// BX = m
   262  
   263  	MOVL	m_gsignal(BX), DX	// DX = gsignal
   264  	CMPL	AX, DX
   265  	JEQ	noswitch
   266  
   267  	MOVL	m_g0(BX), DX	// DX = g0
   268  	CMPL	AX, DX
   269  	JEQ	noswitch
   270  
   271  	MOVL	m_curg(BX), R8
   272  	CMPL	AX, R8
   273  	JEQ	switch
   274  	
   275  	// g is none of gsignal, g0 or curg, which is not allowed here.
   276  	// Hide call from linker nosplit analysis.
   277  	MOVL	$runtime·badsystemstack(SB), AX
   278  	CALL	AX
   279  
   280  switch:
   281  	// save our state in g->sched. Pretend to
   282  	// be systemstack_switch if the G stack is scanned.
   283  	MOVL	$runtime·systemstack_switch(SB), SI
   284  	MOVL	SI, (g_sched+gobuf_pc)(AX)
   285  	MOVL	SP, (g_sched+gobuf_sp)(AX)
   286  	MOVL	AX, (g_sched+gobuf_g)(AX)
   287  
   288  	// switch to g0
   289  	MOVL	DX, g(CX)
   290  	MOVL	(g_sched+gobuf_sp)(DX), SP
   291  
   292  	// call target function
   293  	MOVL	DI, DX	// DX = fn (closure pointer)
   294  	MOVL	0(DI), DI	// DI = code pointer stored at the start of fn
   295  	CALL	DI
   296  
   297  	// switch back to g
   298  	get_tls(CX)
   299  	MOVL	g(CX), AX
   300  	MOVL	g_m(AX), BX
   301  	MOVL	m_curg(BX), AX
   302  	MOVL	AX, g(CX)
   303  	MOVL	(g_sched+gobuf_sp)(AX), SP
   304  	MOVL	$0, (g_sched+gobuf_sp)(AX)	// clear the saved SP once it has been restored
   305  	RET
   306  
   307  noswitch:
   308  	// already on m stack, just call directly
   309  	// Using a tail call here cleans up tracebacks since we won't stop
   310  	// at an intermediate systemstack.
   311  	MOVL	DI, DX
   312  	MOVL	0(DI), DI
   313  	JMP	DI
   314  
   315  /*
   316   * support for morestack
   317   */
   318  
   319  // Called during function prolog when more stack is needed.
   320  //
   321  // The traceback routines see morestack on a g0 as being
   322  // the top of a stack (for example, morestack calling newstack
   323  // calling the scheduler calling newm calling gc), so we must
   324  // record an argument size. For that purpose, it has no arguments.
        //
        // On entry DX holds the context value that is saved into g->sched.ctxt.
        // The routine records f's caller in m->morebuf and f itself in g->sched,
        // then switches to the g0 stack and calls newstack. It refuses to run on
        // m->g0 or m->gsignal.
   325  TEXT runtime·morestack(SB),NOSPLIT,$0-0
   326  	get_tls(CX)
   327  	MOVL	g(CX), BX
   328  	MOVL	g_m(BX), BX
   329  
   330  	// Cannot grow scheduler stack (m->g0).
   331  	MOVL	m_g0(BX), SI
   332  	CMPL	g(CX), SI
   333  	JNE	3(PC)
   334  	CALL	runtime·badmorestackg0(SB)
   335  	MOVL	0, AX	// load from address 0; presumably a deliberate fault if the call returns
   336  
   337  	// Cannot grow signal stack (m->gsignal).
   338  	MOVL	m_gsignal(BX), SI
   339  	CMPL	g(CX), SI
   340  	JNE	3(PC)
   341  	CALL	runtime·badmorestackgsignal(SB)
   342  	MOVL	0, AX	// load from address 0; presumably a deliberate fault if the call returns
   343  
   344  	// Called from f.
   345  	// Set m->morebuf to f's caller.
   346  	MOVL	8(SP), AX	// f's caller's PC
   347  	MOVL	AX, (m_morebuf+gobuf_pc)(BX)
   348  	LEAL	16(SP), AX	// f's caller's SP
   349  	MOVL	AX, (m_morebuf+gobuf_sp)(BX)
   350  	get_tls(CX)
   351  	MOVL	g(CX), SI
   352  	MOVL	SI, (m_morebuf+gobuf_g)(BX)
   353  
   354  	// Set g->sched to context in f.
   355  	MOVL	0(SP), AX // f's PC
   356  	MOVL	AX, (g_sched+gobuf_pc)(SI)
   357  	MOVL	SI, (g_sched+gobuf_g)(SI)
   358  	LEAL	8(SP), AX // f's SP
   359  	MOVL	AX, (g_sched+gobuf_sp)(SI)
   360  	MOVL	DX, (g_sched+gobuf_ctxt)(SI)
   361  
   362  	// Call newstack on m->g0's stack.
   363  	MOVL	m_g0(BX), BX
   364  	MOVL	BX, g(CX)
   365  	MOVL	(g_sched+gobuf_sp)(BX), SP
   366  	CALL	runtime·newstack(SB)
   367  	MOVL	$0, 0x1003	// crash if newstack returns
   368  	RET
   369  
   370  // morestack trampolines
        // morestack_noctxt zeroes DX, the register morestack stores into
        // g->sched.ctxt, and then tail-jumps to morestack.
   371  TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
   372  	MOVL	$0, DX
   373  	JMP	runtime·morestack(SB)
   374  
   375  // reflectcall: call a function with the given argument list
   376  // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
   377  // we don't have variable-sized frames, so we use a small number
   378  // of constant-sized-frame functions to encode a few bits of size in the pc.
   379  // Caution: ugly multiline assembly macros in your future!
   380  
        // DISPATCH(NAME, MAXSIZE): if CX (unsigned) is at most MAXSIZE, jump to NAME
        // through AX; otherwise fall through to the instruction after the expansion.
   381  #define DISPATCH(NAME,MAXSIZE)		\
   382  	CMPL	CX, $MAXSIZE;		\
   383  	JA	3(PC);			\
   384  	MOVL	$NAME(SB), AX;		\
   385  	JMP	AX
   386  // Note: can't just "JMP NAME(SB)" - bad inlining results.
   387  
        // reflect·call is an alias entry point that tail-jumps to ·reflectcall.
   388  TEXT reflect·call(SB), NOSPLIT, $0-0
   389  	JMP	·reflectcall(SB)
   390  
        // reflectcall loads argsize (zero-extended) into CX and jumps to the first
        // runtime·callN in the list below whose N is at least argsize. The sizes are
        // powers of two from 16 up to 1073741824; a larger argsize falls through
        // every DISPATCH and jumps to badreflectcall.
   391  TEXT ·reflectcall(SB), NOSPLIT, $0-20
   392  	MOVLQZX argsize+12(FP), CX
   393  	DISPATCH(runtime·call16, 16)
   394  	DISPATCH(runtime·call32, 32)
   395  	DISPATCH(runtime·call64, 64)
   396  	DISPATCH(runtime·call128, 128)
   397  	DISPATCH(runtime·call256, 256)
   398  	DISPATCH(runtime·call512, 512)
   399  	DISPATCH(runtime·call1024, 1024)
   400  	DISPATCH(runtime·call2048, 2048)
   401  	DISPATCH(runtime·call4096, 4096)
   402  	DISPATCH(runtime·call8192, 8192)
   403  	DISPATCH(runtime·call16384, 16384)
   404  	DISPATCH(runtime·call32768, 32768)
   405  	DISPATCH(runtime·call65536, 65536)
   406  	DISPATCH(runtime·call131072, 131072)
   407  	DISPATCH(runtime·call262144, 262144)
   408  	DISPATCH(runtime·call524288, 524288)
   409  	DISPATCH(runtime·call1048576, 1048576)
   410  	DISPATCH(runtime·call2097152, 2097152)
   411  	DISPATCH(runtime·call4194304, 4194304)
   412  	DISPATCH(runtime·call8388608, 8388608)
   413  	DISPATCH(runtime·call16777216, 16777216)
   414  	DISPATCH(runtime·call33554432, 33554432)
   415  	DISPATCH(runtime·call67108864, 67108864)
   416  	DISPATCH(runtime·call134217728, 134217728)
   417  	DISPATCH(runtime·call268435456, 268435456)
   418  	DISPATCH(runtime·call536870912, 536870912)
   419  	DISPATCH(runtime·call1073741824, 1073741824)
   420  	MOVL	$runtime·badreflectcall(SB), AX
   421  	JMP	AX
   422  
        // CALLFN(NAME, MAXSIZE) defines a WRAPPER function NAME with a MAXSIZE-byte
        // frame and 20 bytes of arguments (argtype, f, argptr, argsize, retoffset).
        // It byte-copies argsize bytes from argptr to the bottom of its frame, calls
        // the code pointer stored at f with f in DX, and then hands the bytes from
        // retoffset up to argsize to callRet so they are copied back to argptr.
   423  #define CALLFN(NAME,MAXSIZE)			\
   424  TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
   425  	NO_LOCAL_POINTERS;			\
   426  	/* copy arguments to stack */		\
   427  	MOVL	argptr+8(FP), SI;		\
   428  	MOVL	argsize+12(FP), CX;		\
   429  	MOVL	SP, DI;				\
   430  	REP;MOVSB;				\
   431  	/* call function */			\
   432  	MOVL	f+4(FP), DX;			\
   433  	MOVL	(DX), AX;			\
   434  	CALL	AX;				\
   435  	/* copy return values back */		\
   436  	MOVL	argtype+0(FP), DX;		\
   437  	MOVL	argptr+8(FP), DI;		\
   438  	MOVL	argsize+12(FP), CX;		\
   439  	MOVL	retoffset+16(FP), BX;		\
   440  	MOVL	SP, SI;				\
   441  	ADDL	BX, DI;				\
   442  	ADDL	BX, SI;				\
   443  	SUBL	BX, CX;				\
   444  	CALL	callRet<>(SB);			\
   445  	RET
   446  
   447  // callRet copies return values back at the end of call*. This is a
   448  // separate function so it can allocate stack space for the arguments
   449  // to reflectcallmove. It does not follow the Go ABI; it expects its
   450  // arguments in registers.
        // Register inputs, in the order they are stored as reflectcallmove's
        // arguments: DX (argtype, as loaded by CALLFN), DI (destination),
        // SI (source), CX (byte count).
   451  TEXT callRet<>(SB), NOSPLIT, $16-0
   452  	MOVL	DX, 0(SP)
   453  	MOVL	DI, 4(SP)
   454  	MOVL	SI, 8(SP)
   455  	MOVL	CX, 12(SP)
   456  	CALL	runtime·reflectcallmove(SB)
   457  	RET
   458  
        // One fixed-frame call function per size class targeted by reflectcall's
        // DISPATCH list: powers of two from 16 bytes to 1073741824 bytes.
   459  CALLFN(·call16, 16)
   460  CALLFN(·call32, 32)
   461  CALLFN(·call64, 64)
   462  CALLFN(·call128, 128)
   463  CALLFN(·call256, 256)
   464  CALLFN(·call512, 512)
   465  CALLFN(·call1024, 1024)
   466  CALLFN(·call2048, 2048)
   467  CALLFN(·call4096, 4096)
   468  CALLFN(·call8192, 8192)
   469  CALLFN(·call16384, 16384)
   470  CALLFN(·call32768, 32768)
   471  CALLFN(·call65536, 65536)
   472  CALLFN(·call131072, 131072)
   473  CALLFN(·call262144, 262144)
   474  CALLFN(·call524288, 524288)
   475  CALLFN(·call1048576, 1048576)
   476  CALLFN(·call2097152, 2097152)
   477  CALLFN(·call4194304, 4194304)
   478  CALLFN(·call8388608, 8388608)
   479  CALLFN(·call16777216, 16777216)
   480  CALLFN(·call33554432, 33554432)
   481  CALLFN(·call67108864, 67108864)
   482  CALLFN(·call134217728, 134217728)
   483  CALLFN(·call268435456, 268435456)
   484  CALLFN(·call536870912, 536870912)
   485  CALLFN(·call1073741824, 1073741824)
   486  
        // procyield executes PAUSE once per loop iteration, counting down from the
        // cycles argument. The counter is decremented before it is tested, so a
        // cycles value of 0 wraps around instead of exiting immediately.
        // NOTE(review): the frame is declared $0-0 although cycles+0(FP) is read;
        // confirm whether the argument size should be 4.
   487  TEXT runtime·procyield(SB),NOSPLIT,$0-0
   488  	MOVL	cycles+0(FP), AX
   489  again:
   490  	PAUSE
   491  	SUBL	$1, AX
   492  	JNZ	again
   493  	RET
   494  
        // publicationBarrier emits no barrier instruction on this architecture; the
        // body is a bare RET for the reason given below.
   495  TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
   496  	// Stores are already ordered on x86, so this is just a
   497  	// compile barrier.
   498  	RET
   499  
   500  // void jmpdefer(fn, sp);
   501  // called from deferreturn.
   502  // 1. pop the caller
   503  // 2. sub 5 bytes from the caller's return address
   504  // 3. jmp to the argument
        // SP is reset to 8 bytes below argp, where the return address pushed by the
        // caller's CALL lives. Backing that address up by 5 bytes makes the deferred
        // function return to the CALL instruction itself, so it executes again.
        // fv is left in DX for the deferred function.
   505  TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
   506  	MOVL	fv+0(FP), DX
   507  	MOVL	argp+4(FP), BX
   508  	LEAL	-8(BX), SP	// caller sp after CALL
   509  	SUBL	$5, (SP)	// return to CALL again
   510  	MOVL	0(DX), BX	// BX = code pointer stored at the start of fv
   511  	JMP	BX	// but first run the deferred function
   512  
   513  // func asmcgocall(fn, arg unsafe.Pointer) int32
   514  // Not implemented.
        // The body only loads from address 0, presumably so that any call faults.
   515  TEXT runtime·asmcgocall(SB),NOSPLIT,$0-12
   516  	MOVL	0, AX
   517  	RET
   518  
   519  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
   520  // Not implemented.
        // The body only loads from address 0, presumably so that any call faults.
   521  TEXT runtime·cgocallback(SB),NOSPLIT,$0-16
   522  	MOVL	0, AX
   523  	RET
   524  
   525  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
   526  // Not implemented.
        // The body only loads from address 0, presumably so that any call faults.
   527  TEXT ·cgocallback_gofunc(SB),NOSPLIT,$0-16
   528  	MOVL	0, AX
   529  	RET
   530  
   531  // void setg(G*); set g. for use by needm.
   532  // Not implemented.
        // The body only loads from address 0, presumably so that any call faults.
   533  TEXT runtime·setg(SB), NOSPLIT, $0-4
   534  	MOVL	0, AX
   535  	RET
   536  
   537  // check that SP is in range [g->stack.lo, g->stack.hi)
        // Each bound is checked with an unsigned compare; when a check fails the
        // following load from address 0 is executed instead of being skipped.
   538  TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
   539  	get_tls(CX)
   540  	MOVL	g(CX), AX
   541  	CMPL	(g_stack+stack_hi)(AX), SP
   542  	JHI	2(PC)	// stack.hi > SP: ok, skip the faulting load
   543  	MOVL	0, AX
   544  	CMPL	SP, (g_stack+stack_lo)(AX)
   545  	JHI	2(PC)	// SP > stack.lo: ok, skip the faulting load
   546  	MOVL	0, AX
   547  	RET
   548  
   549  // int64 runtime·cputicks(void)
        // Reads the time-stamp counter with RDTSC, combines DX (shifted left by 32)
        // with AX into one 64-bit value, and stores it as the result.
   550  TEXT runtime·cputicks(SB),NOSPLIT,$0-0
   551  	RDTSC
   552  	SHLQ	$32, DX
   553  	ADDQ	DX, AX
   554  	MOVQ	AX, ret+0(FP)
   555  	RET
   556  
   557  // hash function using AES hardware instructions
   558  // For now, our one amd64p32 system (NaCl) does not
   559  // support using AES instructions, so have not bothered to
   560  // write the implementations. Can copy and adjust the ones
   561  // in asm_amd64.s when the time comes.
   562  
        // Placeholder: no hash is computed. Whatever AX holds on entry is stored as
        // the result.
   563  TEXT runtime·aeshash(SB),NOSPLIT,$0-20
   564  	MOVL	AX, ret+16(FP)
   565  	RET
   566  
        // Placeholder: no hash is computed. Whatever AX holds on entry is stored as
        // the result.
   567  TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
   568  	MOVL	AX, ret+8(FP)
   569  	RET
   570  
        // Placeholder: no hash is computed. Whatever AX holds on entry is stored as
        // the result.
   571  TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
   572  	MOVL	AX, ret+8(FP)
   573  	RET
   574  
        // Placeholder: no hash is computed. Whatever AX holds on entry is stored as
        // the result.
   575  TEXT runtime·aeshash64(SB),NOSPLIT,$0-12
   576  	MOVL	AX, ret+8(FP)
   577  	RET
   578  
   579  // memequal(p, q unsafe.Pointer, size uintptr) bool
        // Returns true without reading memory when both pointers are identical;
        // otherwise the comparison is delegated to memeqbody (SI = a, DI = b,
        // BX = size) and the low byte of AX is stored as the result.
   580  TEXT runtime·memequal(SB),NOSPLIT,$0-17
   581  	MOVL	a+0(FP), SI
   582  	MOVL	b+4(FP), DI
   583  	CMPL	SI, DI
   584  	JEQ	eq
   585  	MOVL	size+8(FP), BX
   586  	CALL	runtime·memeqbody(SB)
   587  	MOVB	AX, ret+16(FP)
   588  	RET
   589  eq:
   590  	MOVB    $1, ret+16(FP)
   591  	RET
   592  
   593  // memequal_varlen(a, b unsafe.Pointer) bool
        // Same as memequal, except that the byte count is read from offset 4 of the
        // closure whose address is in DX on entry.
   594  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
   595  	MOVL    a+0(FP), SI
   596  	MOVL    b+4(FP), DI
   597  	CMPL    SI, DI
   598  	JEQ     eq
   599  	MOVL    4(DX), BX    // compiler stores size at offset 4 in the closure
   600  	CALL    runtime·memeqbody(SB)
   601  	MOVB    AX, ret+8(FP)
   602  	RET
   603  eq:
   604  	MOVB    $1, ret+8(FP)
   605  	RET
   606  
        // memeqbody compares two byte ranges of equal length.
        // Register calling convention (not the Go stack ABI):
   607  // a in SI
   608  // b in DI
   609  // count in BX
        // result in AX: 1 if the ranges are equal, 0 otherwise
        // SI, DI, BX, CX, DX and X0-X7 are used as scratch.
   610  TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
   611  	XORQ	AX, AX	// default result: not equal
   612  
   613  	CMPQ	BX, $8
   614  	JB	small
   615  	
   616  	// 64 bytes at a time using xmm registers
   617  hugeloop:
   618  	CMPQ	BX, $64
   619  	JB	bigloop
   620  	MOVOU	(SI), X0
   621  	MOVOU	(DI), X1
   622  	MOVOU	16(SI), X2
   623  	MOVOU	16(DI), X3
   624  	MOVOU	32(SI), X4
   625  	MOVOU	32(DI), X5
   626  	MOVOU	48(SI), X6
   627  	MOVOU	48(DI), X7
   628  	PCMPEQB	X1, X0
   629  	PCMPEQB	X3, X2
   630  	PCMPEQB	X5, X4
   631  	PCMPEQB	X7, X6
   632  	PAND	X2, X0
   633  	PAND	X6, X4
   634  	PAND	X4, X0
   635  	PMOVMSKB X0, DX
   636  	ADDQ	$64, SI
   637  	ADDQ	$64, DI
   638  	SUBQ	$64, BX
   639  	CMPL	DX, $0xffff	// all 16 mask bits set <=> all 64 bytes matched
   640  	JEQ	hugeloop
   641  	RET	// mismatch: AX is still 0
   642  
   643  	// 8 bytes at a time using 64-bit register
   644  bigloop:
   645  	CMPQ	BX, $8
   646  	JBE	leftover
   647  	MOVQ	(SI), CX
   648  	MOVQ	(DI), DX
   649  	ADDQ	$8, SI
   650  	ADDQ	$8, DI
   651  	SUBQ	$8, BX
   652  	CMPQ	CX, DX
   653  	JEQ	bigloop
   654  	RET	// mismatch: AX is still 0
   655  
   656  	// remaining 0-8 bytes
        	// Compare the last 8 bytes of each range; this load may overlap bytes
        	// that were already compared.
   657  leftover:
   658  	ADDQ	BX, SI
   659  	ADDQ	BX, DI
   660  	MOVQ	-8(SI), CX
   661  	MOVQ	-8(DI), DX
   662  	CMPQ	CX, DX
   663  	SETEQ	AX
   664  	RET
   665  
   666  small:
   667  	CMPQ	BX, $0
   668  	JEQ	equal	// zero length: ZF is set, so SETEQ below yields 1
   669  
   670  	LEAQ	0(BX*8), CX
   671  	NEGQ	CX	// CX = -(8*count); used below as a shift count
   672  
   673  	CMPB	SI, $0xf8
   674  	JA	si_high
   675  
   676  	// load at SI won't cross a page boundary.
   677  	MOVQ	(SI), SI
   678  	JMP	si_finish
   679  si_high:
   680  	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
   681  	MOVQ	BX, DX
   682  	ADDQ	SI, DX
   683  	MOVQ	-8(DX), SI
   684  	SHRQ	CX, SI
   685  si_finish:
   686  
   687  	// same for DI.
   688  	CMPB	DI, $0xf8
   689  	JA	di_high
   690  	MOVQ	(DI), DI
   691  	JMP	di_finish
   692  di_high:
   693  	MOVQ	BX, DX
   694  	ADDQ	DI, DX
   695  	MOVQ	-8(DX), DI
   696  	SHRQ	CX, DI
   697  di_finish:
   698  
   699  	SUBQ	SI, DI
   700  	SHLQ	CX, DI	// shift out the bytes beyond count; ZF set iff the wanted bytes match
   701  equal:
   702  	SETEQ	AX
   703  	RET
   704  
        // cmpstring loads the two (base, len) string arguments into the registers
        // cmpbody expects (SI/BX for s1, DI/DX for s2) and stores cmpbody's AX
        // result as the return value.
   705  TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
   706  	MOVL	s1_base+0(FP), SI
   707  	MOVL	s1_len+4(FP), BX
   708  	MOVL	s2_base+8(FP), DI
   709  	MOVL	s2_len+12(FP), DX
   710  	CALL	runtime·cmpbody(SB)
   711  	MOVL	AX, ret+16(FP)
   712  	RET
   713  
        // bytes·Compare loads the pointer and length words of both arguments
        // (offsets 0/4 and 12/16) into SI/BX and DI/DX, calls cmpbody, and stores
        // AX as the result.
   714  TEXT bytes·Compare(SB),NOSPLIT,$0-28
   715  	MOVL	s1+0(FP), SI
   716  	MOVL	s1+4(FP), BX
   717  	MOVL	s2+12(FP), DI
   718  	MOVL	s2+16(FP), DX
   719  	CALL	runtime·cmpbody(SB)
   720  	MOVL	AX, res+24(FP)
   721  	RET
   722  
        // cmpbody compares two byte ranges lexicographically.
        // Register calling convention (not the Go stack ABI):
   723  // input:
   724  //   SI = a
   725  //   DI = b
   726  //   BX = alen
   727  //   DX = blen
   728  // output:
   729  //   AX = 1/0/-1
        // The first min(alen, blen) bytes decide the result; if they are all equal
        // (or a and b are the same address) the lengths decide it.
   730  TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
   731  	CMPQ	SI, DI
   732  	JEQ	allsame
   733  	CMPQ	BX, DX
   734  	MOVQ	DX, R8
   735  	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
   736  	CMPQ	R8, $8
   737  	JB	small
   738  
   739  loop:
   740  	CMPQ	R8, $16
   741  	JBE	_0through16
   742  	MOVOU	(SI), X0
   743  	MOVOU	(DI), X1
   744  	PCMPEQB X0, X1
   745  	PMOVMSKB X1, AX
   746  	XORQ	$0xffff, AX	// convert EQ to NE
   747  	JNE	diff16	// branch if at least one byte is not equal
   748  	ADDQ	$16, SI
   749  	ADDQ	$16, DI
   750  	SUBQ	$16, R8
   751  	JMP	loop
   752  	
   753  	// AX = bit mask of differences
   754  diff16:
   755  	BSFQ	AX, BX	// index of first byte that differs
   756  	XORQ	AX, AX
   757  	ADDQ	BX, SI
   758  	MOVB	(SI), CX
   759  	ADDQ	BX, DI
   760  	CMPB	CX, (DI)
   761  	SETHI	AX
   762  	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
   763  	RET
   764  
   765  	// 0 through 16 bytes left, alen>=8, blen>=8
   766  _0through16:
   767  	CMPQ	R8, $8
   768  	JBE	_0through8
   769  	MOVQ	(SI), AX
   770  	MOVQ	(DI), CX
   771  	CMPQ	AX, CX
   772  	JNE	diff8
   773  _0through8:
        	// Compare the last 8 bytes of the common prefix; this load may overlap
        	// bytes that were already compared.
   774  	ADDQ	R8, SI
   775  	ADDQ	R8, DI
   776  	MOVQ	-8(SI), AX
   777  	MOVQ	-8(DI), CX
   778  	CMPQ	AX, CX
   779  	JEQ	allsame
   780  
   781  	// AX and CX contain parts of a and b that differ.
   782  diff8:
   783  	BSWAPQ	AX	// reverse order of bytes
   784  	BSWAPQ	CX
   785  	XORQ	AX, CX
   786  	BSRQ	CX, CX	// index of highest bit difference
   787  	SHRQ	CX, AX	// move a's bit to bottom
   788  	ANDQ	$1, AX	// mask bit
   789  	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
   790  	RET
   791  
   792  	// 0-7 bytes in common
   793  small:
   794  	LEAQ	(R8*8), CX	// bytes left -> bits left
   795  	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
   796  	JEQ	allsame
   797  
   798  	// load bytes of a into high bytes of SI
   799  	CMPB	SI, $0xf8
   800  	JA	si_high
   801  	MOVQ	(SI), SI
   802  	JMP	si_finish
   803  si_high:
   804  	ADDQ	R8, SI
   805  	MOVQ	-8(SI), SI
   806  	SHRQ	CX, SI
   807  si_finish:
   808  	SHLQ	CX, SI
   809  
   810  	// load bytes of b into high bytes of DI
   811  	CMPB	DI, $0xf8
   812  	JA	di_high
   813  	MOVQ	(DI), DI
   814  	JMP	di_finish
   815  di_high:
   816  	ADDQ	R8, DI
   817  	MOVQ	-8(DI), DI
   818  	SHRQ	CX, DI
   819  di_finish:
   820  	SHLQ	CX, DI
   821  
   822  	BSWAPQ	SI	// reverse order of bytes
   823  	BSWAPQ	DI
   824  	XORQ	SI, DI	// find bit differences
   825  	JEQ	allsame
   826  	BSRQ	DI, CX	// index of highest bit difference
   827  	SHRQ	CX, SI	// move a's bit to bottom
   828  	ANDQ	$1, SI	// mask bit
   829  	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
   830  	RET
   831  
        	// Common prefix is identical: the result depends on the lengths only.
   832  allsame:
   833  	XORQ	AX, AX
   834  	XORQ	CX, CX
   835  	CMPQ	BX, DX
   836  	SETGT	AX	// 1 if alen > blen
   837  	SETEQ	CX	// 1 if alen == blen
   838  	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
   839  	RET
   840  
        // bytes·IndexByte loads the data pointer (SI), length (BX) and the byte to
        // find (AL, from offset 12), calls indexbytebody, and stores AX as the
        // result.
   841  TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
   842  	MOVL s+0(FP), SI
   843  	MOVL s_len+4(FP), BX
   844  	MOVB c+12(FP), AL
   845  	CALL runtime·indexbytebody(SB)
   846  	MOVL AX, ret+16(FP)
   847  	RET
   848  
        // strings·IndexByte loads the data pointer (SI), length (BX) and the byte to
        // find (AL, from offset 8), calls indexbytebody, and stores AX as the
        // result.
   849  TEXT strings·IndexByte(SB),NOSPLIT,$0-20
   850  	MOVL s+0(FP), SI
   851  	MOVL s_len+4(FP), BX
   852  	MOVB c+8(FP), AL
   853  	CALL runtime·indexbytebody(SB)
   854  	MOVL AX, ret+16(FP)
   855  	RET
   856  
   857  // input:
   858  //   SI: data
   859  //   BX: data len
   860  //   AL: byte sought
   861  // output:
   862  //   AX
   863  TEXT runtime·indexbytebody(SB),NOSPLIT,$0
   864  	MOVL SI, DI
   865  
   866  	CMPL BX, $16
   867  	JLT small
   868  
   869  	// round up to first 16-byte boundary
   870  	TESTL $15, SI
   871  	JZ aligned
   872  	MOVL SI, CX
   873  	ANDL $~15, CX
   874  	ADDL $16, CX
   875  
   876  	// search the beginning
   877  	SUBL SI, CX
   878  	REPN; SCASB
   879  	JZ success
   880  
   881  // DI is 16-byte aligned; get ready to search using SSE instructions
   882  aligned:
   883  	// round down to last 16-byte boundary
   884  	MOVL BX, R11
   885  	ADDL SI, R11
   886  	ANDL $~15, R11
   887  
   888  	// shuffle X0 around so that each byte contains c
   889  	MOVD AX, X0
   890  	PUNPCKLBW X0, X0
   891  	PUNPCKLBW X0, X0
   892  	PSHUFL $0, X0, X0
   893  	JMP condition
   894  
   895  sse:
   896  	// move the next 16-byte chunk of the buffer into X1
   897  	MOVO (DI), X1
   898  	// compare bytes in X0 to X1
   899  	PCMPEQB X0, X1
   900  	// take the top bit of each byte in X1 and put the result in DX
   901  	PMOVMSKB X1, DX
   902  	TESTL DX, DX
   903  	JNZ ssesuccess
   904  	ADDL $16, DI
   905  
   906  condition:
   907  	CMPL DI, R11
   908  	JLT sse
   909  
   910  	// search the end
   911  	MOVL SI, CX
   912  	ADDL BX, CX
   913  	SUBL R11, CX
   914  	// if CX == 0, the zero flag will be set and we'll end up
   915  	// returning a false success
   916  	JZ failure
   917  	REPN; SCASB
   918  	JZ success
   919  
   920  failure:
   921  	MOVL $-1, AX
   922  	RET
   923  
   924  // handle for lengths < 16
   925  small:
   926  	MOVL BX, CX
   927  	REPN; SCASB
   928  	JZ success
   929  	MOVL $-1, AX
   930  	RET
   931  
   932  // we've found the chunk containing the byte
   933  // now just figure out which specific byte it is
   934  ssesuccess:
   935  	// get the index of the least significant set bit
   936  	BSFW DX, DX
   937  	SUBL SI, DI
   938  	ADDL DI, DX
   939  	MOVL DX, AX
   940  	RET
   941  
   942  success:
   943  	SUBL SI, DI
   944  	SUBL $1, DI
   945  	MOVL DI, AX
   946  	RET
   947  
        // bytes·Equal returns false (AX cleared) when the two lengths differ;
        // otherwise it calls memeqbody with SI = a, DI = b, BX = length and stores
        // the low byte of AX as the result.
   948  TEXT bytes·Equal(SB),NOSPLIT,$0-25
   949  	MOVL	a_len+4(FP), BX
   950  	MOVL	b_len+16(FP), CX
   951  	XORL	AX, AX
   952  	CMPL	BX, CX
   953  	JNE	eqret
   954  	MOVL	a+0(FP), SI
   955  	MOVL	b+12(FP), DI
   956  	CALL	runtime·memeqbody(SB)
   957  eqret:
   958  	MOVB	AX, ret+24(FP)
   959  	RET
   960  
        // return0 sets AX to 0 and returns.
   961  TEXT runtime·return0(SB), NOSPLIT, $0
   962  	MOVL	$0, AX
   963  	RET
   964  
   965  // The top-most function running on a goroutine
   966  // returns to goexit+PCQuantum.
        // The leading NOP byte keeps that return address inside this function's
        // code, past its first byte. There is no RET: goexit1 is not expected to
        // come back.
   967  TEXT runtime·goexit(SB),NOSPLIT,$0-0
   968  	BYTE	$0x90	// NOP
   969  	CALL	runtime·goexit1(SB)	// does not return
   970  	// traceback from goexit1 must hit code range of goexit
   971  	BYTE	$0x90	// NOP
   972  
        // checkASM unconditionally stores 1 (true) as its one-byte result.
   973  TEXT ·checkASM(SB),NOSPLIT,$0-1
   974  	MOVB	$1, ret+0(FP)
   975  	RET