github.com/xushiwei/go@v0.0.0-20130601165731-2b9d83f45bc9/src/pkg/runtime/asm_amd64.s

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "zasm_GOOS_GOARCH.h"
     6  
     7  TEXT _rt0_amd64(SB),7,$-8
     8  	// copy arguments forward on an even stack
     9  	MOVQ	DI, AX		// argc
    10  	MOVQ	SI, BX		// argv
    11  	SUBQ	$(4*8+7), SP		// 2args 2auto
    12  	ANDQ	$~15, SP
    13  	MOVQ	AX, 16(SP)
    14  	MOVQ	BX, 24(SP)
    15  	
    16  	// create istack out of the given (operating system) stack.
    17  	// _cgo_init may update stackguard.
    18  	MOVQ	$runtime·g0(SB), DI
    19  	LEAQ	(-64*1024+104)(SP), BX
    20  	MOVQ	BX, g_stackguard(DI)
    21  	MOVQ	SP, g_stackbase(DI)
    22  
    23  	// find out information about the processor we're on
    24  	MOVQ	$0, AX
    25  	CPUID
    26  	CMPQ	AX, $0
    27  	JE	nocpuinfo
    28  	MOVQ	$1, AX
    29  	CPUID
    30  	MOVL	CX, runtime·cpuid_ecx(SB)
    31  	MOVL	DX, runtime·cpuid_edx(SB)
    32  nocpuinfo:	
    33  	
    34  	// if there is an _cgo_init, call it.
    35  	MOVQ	_cgo_init(SB), AX
    36  	TESTQ	AX, AX
    37  	JZ	needtls
    38  	// g0 already in DI
    39  	MOVQ	DI, CX	// Win64 uses CX for first parameter
    40  	MOVQ	$setmg_gcc<>(SB), SI
    41  	CALL	AX
    42  	CMPL	runtime·iswindows(SB), $0
    43  	JEQ ok
    44  
    45  needtls:
    46  	// skip TLS setup on Plan 9
    47  	CMPL	runtime·isplan9(SB), $1
    48  	JEQ ok
    49  
    50  	LEAQ	runtime·tls0(SB), DI
    51  	CALL	runtime·settls(SB)
    52  
    53  	// store through it, to make sure it works
    54  	get_tls(BX)
    55  	MOVQ	$0x123, g(BX)
    56  	MOVQ	runtime·tls0(SB), AX
    57  	CMPQ	AX, $0x123
    58  	JEQ 2(PC)
    59  	MOVL	AX, 0	// abort
    60  ok:
    61  	// set the per-goroutine and per-mach "registers"
    62  	get_tls(BX)
    63  	LEAQ	runtime·g0(SB), CX
    64  	MOVQ	CX, g(BX)
    65  	LEAQ	runtime·m0(SB), AX
    66  	MOVQ	AX, m(BX)
    67  
    68  	// save m->g0 = g0
    69  	MOVQ	CX, m_g0(AX)
    70  
    71  	CLD				// convention is D is always left cleared
    72  	CALL	runtime·check(SB)
    73  
    74  	MOVL	16(SP), AX		// copy argc
    75  	MOVL	AX, 0(SP)
    76  	MOVQ	24(SP), AX		// copy argv
    77  	MOVQ	AX, 8(SP)
    78  	CALL	runtime·args(SB)
    79  	CALL	runtime·osinit(SB)
    80  	CALL	runtime·hashinit(SB)
    81  	CALL	runtime·schedinit(SB)
    82  
    83  	// create a new goroutine to start program
    84  	PUSHQ	$runtime·main·f(SB)		// entry
    85  	PUSHQ	$0			// arg size
    86  	CALL	runtime·newproc(SB)
    87  	POPQ	AX
    88  	POPQ	AX
    89  
    90  	// start this M
    91  	CALL	runtime·mstart(SB)
    92  
    93  	MOVL	$0xf1, 0xf1  // crash
    94  	RET
    95  
    96  DATA	runtime·main·f+0(SB)/8,$runtime·main(SB)
    97  GLOBL	runtime·main·f(SB),8,$8
    98  
    99  TEXT runtime·breakpoint(SB),7,$0
   100  	BYTE	$0xcc	// INT 3 breakpoint trap
   101  	RET
   102  
   103  TEXT runtime·asminit(SB),7,$0
   104  	// No per-thread init.
   105  	RET
   106  
   107  /*
   108   *  go-routine
   109   */
   110  
   111  // void gosave(Gobuf*)
   112  // save state in Gobuf; setjmp
   113  TEXT runtime·gosave(SB), 7, $0
   114  	MOVQ	8(SP), AX		// gobuf
   115  	LEAQ	8(SP), BX		// caller's SP
   116  	MOVQ	BX, gobuf_sp(AX)
   117  	MOVQ	0(SP), BX		// caller's PC
   118  	MOVQ	BX, gobuf_pc(AX)
   119  	get_tls(CX)
   120  	MOVQ	g(CX), BX
   121  	MOVQ	BX, gobuf_g(AX)
   122  	RET
   123  
   124  // void gogo(Gobuf*, uintptr)
   125  // restore state from Gobuf; longjmp
   126  TEXT runtime·gogo(SB), 7, $0
   127  	MOVQ	16(SP), AX		// return 2nd arg
   128  	MOVQ	8(SP), BX		// gobuf
   129  	MOVQ	gobuf_g(BX), DX
   130  	MOVQ	0(DX), CX		// make sure g != nil
   131  	get_tls(CX)
   132  	MOVQ	DX, g(CX)
   133  	MOVQ	gobuf_sp(BX), SP	// restore SP
   134  	MOVQ	gobuf_pc(BX), BX
   135  	JMP	BX
   136  
   137  // void gogocall(Gobuf*, void (*fn)(void), uintptr r0)
   138  // restore state from Gobuf but then call fn.
   139  // (call fn, returning to state in Gobuf)
   140  TEXT runtime·gogocall(SB), 7, $0
   141  	MOVQ	24(SP), DX	// context
   142  	MOVQ	16(SP), AX		// fn
   143  	MOVQ	8(SP), BX		// gobuf
   144  	MOVQ	gobuf_g(BX), DI
   145  	get_tls(CX)
   146  	MOVQ	DI, g(CX)
   147  	MOVQ	0(DI), CX	// make sure g != nil
   148  	MOVQ	gobuf_sp(BX), SP	// restore SP
   149  	MOVQ	gobuf_pc(BX), BX
   150  	PUSHQ	BX
   151  	JMP	AX
   152  	POPQ	BX	// not reached
   153  
   154  // void gogocallfn(Gobuf*, FuncVal*)
   155  // restore state from Gobuf but then call fn.
   156  // (call fn, returning to state in Gobuf)
   157  TEXT runtime·gogocallfn(SB), 7, $0
   158  	MOVQ	16(SP), DX		// fn
   159  	MOVQ	8(SP), BX		// gobuf
   160  	MOVQ	gobuf_g(BX), AX
   161  	get_tls(CX)
   162  	MOVQ	AX, g(CX)
   163  	MOVQ	0(AX), CX	// make sure g != nil
   164  	MOVQ	gobuf_sp(BX), SP	// restore SP
   165  	MOVQ	gobuf_pc(BX), BX
   166  	PUSHQ	BX
   167  	MOVQ	0(DX), BX
   168  	JMP	BX
   169  	POPQ	BX	// not reached
   170  
   171  // void mcall(void (*fn)(G*))
   172  // Switch to m->g0's stack, call fn(g).
   173  // Fn must never return.  It should gogo(&g->sched)
   174  // to keep running g.
   175  TEXT runtime·mcall(SB), 7, $0
   176  	MOVQ	fn+0(FP), DI
   177  	
   178  	get_tls(CX)
   179  	MOVQ	g(CX), AX	// save state in g->gobuf
   180  	MOVQ	0(SP), BX	// caller's PC
   181  	MOVQ	BX, (g_sched+gobuf_pc)(AX)
   182  	LEAQ	8(SP), BX	// caller's SP
   183  	MOVQ	BX, (g_sched+gobuf_sp)(AX)
   184  	MOVQ	AX, (g_sched+gobuf_g)(AX)
   185  
   186  	// switch to m->g0 & its stack, call fn
   187  	MOVQ	m(CX), BX
   188  	MOVQ	m_g0(BX), SI
   189  	CMPQ	SI, AX	// if g == m->g0 call badmcall
   190  	JNE	2(PC)
   191  	CALL	runtime·badmcall(SB)
   192  	MOVQ	SI, g(CX)	// g = m->g0
   193  	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->gobuf.sp
   194  	PUSHQ	AX
   195  	CALL	DI
   196  	POPQ	AX
   197  	CALL	runtime·badmcall2(SB)
   198  	RET
   199  
   200  /*
   201   * support for morestack
   202   */
   203  
   204  // Called during function prolog when more stack is needed.
   205  // Caller has already done get_tls(CX); MOVQ m(CX), BX.
   206  TEXT runtime·morestack(SB),7,$0
   207  	// Cannot grow scheduler stack (m->g0).
   208  	MOVQ	m_g0(BX), SI
   209  	CMPQ	g(CX), SI
   210  	JNE	2(PC)
   211  	INT	$3
   212  	
   213  	MOVQ	DX, m_cret(BX)
   214  
   215  	// Called from f.
   216  	// Set m->morebuf to f's caller.
   217  	MOVQ	8(SP), AX	// f's caller's PC
   218  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   219  	LEAQ	16(SP), AX	// f's caller's SP
   220  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   221  	MOVQ	AX, m_moreargp(BX)
   222  	get_tls(CX)
   223  	MOVQ	g(CX), SI
   224  	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
   225  
   226  	// Set m->morepc to f's PC.
   227  	MOVQ	0(SP), AX
   228  	MOVQ	AX, m_morepc(BX)
   229  
   230  	// Call newstack on m->g0's stack.
   231  	MOVQ	m_g0(BX), BP
   232  	MOVQ	BP, g(CX)
   233  	MOVQ	(g_sched+gobuf_sp)(BP), SP
   234  	CALL	runtime·newstack(SB)
   235  	MOVQ	$0, 0x1003	// crash if newstack returns
   236  	RET
   237  
   238  // Called from reflection library.  Mimics morestack,
   239  // reuses stack growth code to create a frame
   240  // with the desired args running the desired function.
   241  //
   242  // func call(fn *byte, arg *byte, argsize uint32).
   243  TEXT reflect·call(SB), 7, $0
   244  	get_tls(CX)
   245  	MOVQ	m(CX), BX
   246  
   247  	// Save our caller's state as the PC and SP to
   248  	// restore when returning from f.
   249  	MOVQ	0(SP), AX	// our caller's PC
   250  	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
   251  	LEAQ	8(SP), AX	// our caller's SP
   252  	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
   253  	MOVQ	g(CX), AX
   254  	MOVQ	AX, (m_morebuf+gobuf_g)(BX)
   255  
   256  	// Set up morestack arguments to call f on a new stack.
   257  	// We set f's frame size to 1, as a hint to newstack
   258  	// that this is a call from reflect·call.
   259  	// If it turns out that f needs a larger frame than
   260  	// the default stack, f's usual stack growth prolog will
   261  	// allocate a new segment (and recopy the arguments).
   262  	MOVQ	8(SP), AX	// fn
   263  	MOVQ	16(SP), DX	// arg frame
   264  	MOVL	24(SP), CX	// arg size
   265  
   266  	MOVQ	AX, m_morepc(BX)	// f's PC
   267  	MOVQ	DX, m_moreargp(BX)	// argument frame pointer
   268  	MOVL	CX, m_moreargsize(BX)	// f's argument size
   269  	MOVL	$1, m_moreframesize(BX)	// f's frame size
   270  
   271  	// Call newstack on m->g0's stack.
   272  	MOVQ	m_g0(BX), BP
   273  	get_tls(CX)
   274  	MOVQ	BP, g(CX)
   275  	MOVQ	(g_sched+gobuf_sp)(BP), SP
   276  	CALL	runtime·newstack(SB)
   277  	MOVQ	$0, 0x1103	// crash if newstack returns
   278  	RET
   279  
   280  // Return point when leaving stack.
   281  TEXT runtime·lessstack(SB), 7, $0
   282  	// Save return value in m->cret
   283  	get_tls(CX)
   284  	MOVQ	m(CX), BX
   285  	MOVQ	AX, m_cret(BX)
   286  
   287  	// Call oldstack on m->g0's stack.
   288  	MOVQ	m_g0(BX), BP
   289  	MOVQ	BP, g(CX)
   290  	MOVQ	(g_sched+gobuf_sp)(BP), SP
   291  	CALL	runtime·oldstack(SB)
   292  	MOVQ	$0, 0x1004	// crash if oldstack returns
   293  	RET
   294  
   295  // morestack trampolines
   296  TEXT runtime·morestack00(SB),7,$0
   297  	get_tls(CX)
   298  	MOVQ	m(CX), BX
   299  	MOVQ	$0, AX
   300  	MOVQ	AX, m_moreframesize(BX)
   301  	MOVQ	$runtime·morestack(SB), AX
   302  	JMP	AX
   303  
   304  TEXT runtime·morestack01(SB),7,$0
   305  	get_tls(CX)
   306  	MOVQ	m(CX), BX
   307  	SHLQ	$32, AX
   308  	MOVQ	AX, m_moreframesize(BX)
   309  	MOVQ	$runtime·morestack(SB), AX
   310  	JMP	AX
   311  
   312  TEXT runtime·morestack10(SB),7,$0
   313  	get_tls(CX)
   314  	MOVQ	m(CX), BX
   315  	MOVLQZX	AX, AX
   316  	MOVQ	AX, m_moreframesize(BX)
   317  	MOVQ	$runtime·morestack(SB), AX
   318  	JMP	AX
   319  
   320  TEXT runtime·morestack11(SB),7,$0
   321  	get_tls(CX)
   322  	MOVQ	m(CX), BX
   323  	MOVQ	AX, m_moreframesize(BX)
   324  	MOVQ	$runtime·morestack(SB), AX
   325  	JMP	AX
   326  
   327  // subcases of morestack01
   328  // with const of 8,16,...48
   329  TEXT runtime·morestack8(SB),7,$0
   330  	PUSHQ	$1
   331  	MOVQ	$morestack<>(SB), AX
   332  	JMP	AX
   333  
   334  TEXT runtime·morestack16(SB),7,$0
   335  	PUSHQ	$2
   336  	MOVQ	$morestack<>(SB), AX
   337  	JMP	AX
   338  
   339  TEXT runtime·morestack24(SB),7,$0
   340  	PUSHQ	$3
   341  	MOVQ	$morestack<>(SB), AX
   342  	JMP	AX
   343  
   344  TEXT runtime·morestack32(SB),7,$0
   345  	PUSHQ	$4
   346  	MOVQ	$morestack<>(SB), AX
   347  	JMP	AX
   348  
   349  TEXT runtime·morestack40(SB),7,$0
   350  	PUSHQ	$5
   351  	MOVQ	$morestack<>(SB), AX
   352  	JMP	AX
   353  
   354  TEXT runtime·morestack48(SB),7,$0
   355  	PUSHQ	$6
   356  	MOVQ	$morestack<>(SB), AX
   357  	JMP	AX
   358  
   359  TEXT morestack<>(SB),7,$0
   360  	get_tls(CX)
   361  	MOVQ	m(CX), BX
   362  	POPQ	AX
   363  	SHLQ	$35, AX
   364  	MOVQ	AX, m_moreframesize(BX)
   365  	MOVQ	$runtime·morestack(SB), AX
   366  	JMP	AX
   367  
   368  // bool cas(int32 *val, int32 old, int32 new)
   369  // Atomically:
   370  //	if(*val == old){
   371  //		*val = new;
   372  //		return 1;
   373  //	} else
   374  //		return 0;
   375  TEXT runtime·cas(SB), 7, $0
   376  	MOVQ	8(SP), BX
   377  	MOVL	16(SP), AX
   378  	MOVL	20(SP), CX
   379  	LOCK
   380  	CMPXCHGL	CX, 0(BX)
   381  	JZ 3(PC)
   382  	MOVL	$0, AX
   383  	RET
   384  	MOVL	$1, AX
   385  	RET
   386  
   387  // bool	runtime·cas64(uint64 *val, uint64 *old, uint64 new)
   388  // Atomically:
   389  //	if(*val == *old){
   390  //		*val = new;
   391  //		return 1;
   392  //	} else {
   393  //		*old = *val
   394  //		return 0;
   395  //	}
   396  TEXT runtime·cas64(SB), 7, $0
   397  	MOVQ	8(SP), BX
   398  	MOVQ	16(SP), BP
   399  	MOVQ	0(BP), AX
   400  	MOVQ	24(SP), CX
   401  	LOCK
   402  	CMPXCHGQ	CX, 0(BX)
   403  	JNZ	cas64_fail
   404  	MOVL	$1, AX
   405  	RET
   406  cas64_fail:
   407  	MOVQ	AX, 0(BP)
   408  	MOVL	$0, AX
   409  	RET
   410  
   411  // bool casp(void **val, void *old, void *new)
   412  // Atomically:
   413  //	if(*val == old){
   414  //		*val = new;
   415  //		return 1;
   416  //	} else
   417  //		return 0;
   418  TEXT runtime·casp(SB), 7, $0
   419  	MOVQ	8(SP), BX
   420  	MOVQ	16(SP), AX
   421  	MOVQ	24(SP), CX
   422  	LOCK
   423  	CMPXCHGQ	CX, 0(BX)
   424  	JZ 3(PC)
   425  	MOVL	$0, AX
   426  	RET
   427  	MOVL	$1, AX
   428  	RET
   429  
   430  // uint32 xadd(uint32 volatile *val, int32 delta)
   431  // Atomically:
   432  //	*val += delta;
   433  //	return *val;
   434  TEXT runtime·xadd(SB), 7, $0
   435  	MOVQ	8(SP), BX
   436  	MOVL	16(SP), AX
   437  	MOVL	AX, CX
   438  	LOCK
   439  	XADDL	AX, 0(BX)
   440  	ADDL	CX, AX
   441  	RET
   442  
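        // uint64 xadd64(uint64 volatile *val, int64 delta)
        // Atomically:
        //	*val += delta;
        //	return *val;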
   443  TEXT runtime·xadd64(SB), 7, $0
   444  	MOVQ	8(SP), BX
   445  	MOVQ	16(SP), AX
   446  	MOVQ	AX, CX
   447  	LOCK
   448  	XADDQ	AX, 0(BX)
   449  	ADDQ	CX, AX
   450  	RET
   451  
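        // uint32 xchg(uint32 volatile *val, uint32 new)
        // Atomically:
        //	old = *val;
        //	*val = new;
        //	return old;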
   452  TEXT runtime·xchg(SB), 7, $0
   453  	MOVQ	8(SP), BX
   454  	MOVL	16(SP), AX
   455  	XCHGL	AX, 0(BX)
   456  	RET
   457  
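        // uint64 xchg64(uint64 volatile *val, uint64 new)
        // Atomically:
        //	old = *val;
        //	*val = new;
        //	return old;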
   458  TEXT runtime·xchg64(SB), 7, $0
   459  	MOVQ	8(SP), BX
   460  	MOVQ	16(SP), AX
   461  	XCHGQ	AX, 0(BX)
   462  	RET
   463  
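        // void procyield(uint32 cycles)
        // Busy-wait for the given number of iterations, issuing the PAUSE
        // spin-wait hint on each one.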
   464  TEXT runtime·procyield(SB),7,$0
   465  	MOVL	8(SP), AX
   466  again:
   467  	PAUSE
   468  	SUBL	$1, AX
   469  	JNZ	again
   470  	RET
   471  
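        // void atomicstorep(void **addr, void *v)
        // Atomically:
        //	*addr = v;
        // The implicitly locked XCHGQ also acts as a full memory barrier.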
   472  TEXT runtime·atomicstorep(SB), 7, $0
   473  	MOVQ	8(SP), BX
   474  	MOVQ	16(SP), AX
   475  	XCHGQ	AX, 0(BX)
   476  	RET
   477  
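        // void atomicstore(uint32 volatile *addr, uint32 v)
        // Atomically:
        //	*addr = v;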
   478  TEXT runtime·atomicstore(SB), 7, $0
   479  	MOVQ	8(SP), BX
   480  	MOVL	16(SP), AX
   481  	XCHGL	AX, 0(BX)
   482  	RET
   483  
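        // void atomicstore64(uint64 volatile *addr, uint64 v)
        // Atomically:
        //	*addr = v;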
   484  TEXT runtime·atomicstore64(SB), 7, $0
   485  	MOVQ	8(SP), BX
   486  	MOVQ	16(SP), AX
   487  	XCHGQ	AX, 0(BX)
   488  	RET
   489  
   490  // void jmpdefer(fn, sp);
   491  // called from deferreturn.
   492  // 1. pop the caller
   493  // 2. subtract 5 bytes from the caller's return address (back to the CALL)
   494  // 3. jmp to the argument
   495  TEXT runtime·jmpdefer(SB), 7, $0
   496  	MOVQ	8(SP), DX	// fn
   497  	MOVQ	16(SP), BX	// caller sp
   498  	LEAQ	-8(BX), SP	// caller sp after CALL
   499  	SUBQ	$5, (SP)	// return to CALL again
   500  	MOVQ	0(DX), BX
   501  	JMP	BX	// but first run the deferred function
   502  
   503  // Dummy function to use in saved gobuf.PC,
   504  // to match SP pointing at a return address.
   505  // The gobuf.PC is unused by the contortions here
   506  // but setting it to return will make the traceback code work.
   507  TEXT return<>(SB),7,$0
   508  	RET
   509  
   510  // asmcgocall(void(*fn)(void*), void *arg)
   511  // Call fn(arg) on the scheduler stack,
   512  // aligned appropriately for the gcc ABI.
   513  // See cgocall.c for more details.
   514  TEXT runtime·asmcgocall(SB),7,$0
   515  	MOVQ	fn+0(FP), AX
   516  	MOVQ	arg+8(FP), BX
   517  	MOVQ	SP, DX
   518  
   519  	// Figure out if we need to switch to m->g0 stack.
   520  	// We get called to create new OS threads too, and those
   521  	// come in on the m->g0 stack already.
   522  	get_tls(CX)
   523  	MOVQ	m(CX), BP
   524  	MOVQ	m_g0(BP), SI
   525  	MOVQ	g(CX), DI
   526  	CMPQ	SI, DI
   527  	JEQ	6(PC)
   528  	MOVQ	SP, (g_sched+gobuf_sp)(DI)
   529  	MOVQ	$return<>(SB), (g_sched+gobuf_pc)(DI)
   530  	MOVQ	DI, (g_sched+gobuf_g)(DI)
   531  	MOVQ	SI, g(CX)
   532  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   533  
   534  	// Now on a scheduling stack (a pthread-created stack).
   535  	// Make sure we have enough room for 4 stack-backed fast-call
   536  	// registers as per windows amd64 calling convention.
   537  	SUBQ	$64, SP
   538  	ANDQ	$~15, SP	// alignment for gcc ABI
   539  	MOVQ	DI, 48(SP)	// save g
   540  	MOVQ	DX, 40(SP)	// save SP
   541  	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
   542  	MOVQ	BX, CX		// CX = first argument in Win64
   543  	CALL	AX
   544  
   545  	// Restore registers, g, stack pointer.
   546  	get_tls(CX)
   547  	MOVQ	48(SP), DI
   548  	MOVQ	DI, g(CX)
   549  	MOVQ	40(SP), SP
   550  	RET
   551  
   552  // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
   553  // Turn the fn into a Go func (by taking its address) and call
   554  // cgocallback_gofunc.
   555  TEXT runtime·cgocallback(SB),7,$24
   556  	LEAQ	fn+0(FP), AX
   557  	MOVQ	AX, 0(SP)
   558  	MOVQ	frame+8(FP), AX
   559  	MOVQ	AX, 8(SP)
   560  	MOVQ	framesize+16(FP), AX
   561  	MOVQ	AX, 16(SP)
   562  	MOVQ	$runtime·cgocallback_gofunc(SB), AX
   563  	CALL	AX
   564  	RET
   565  
   566  // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
   567  // See cgocall.c for more details.
   568  TEXT runtime·cgocallback_gofunc(SB),7,$24
   569  	// If m is nil, Go did not create the current thread.
   570  	// Call needm to obtain one for temporary use.
   571  	// In this case, we're running on the thread stack, so there's
   572  	// lots of space, but the linker doesn't know. Hide the call from
   573  	// the linker analysis by using an indirect call through AX.
   574  	get_tls(CX)
   575  #ifdef GOOS_windows
   576  	CMPQ	CX, $0
   577  	JNE	3(PC)
   578  	PUSHQ	$0
   579  	JMP	needm
   580  #endif
   581  	MOVQ	m(CX), BP
   582  	PUSHQ	BP
   583  	CMPQ	BP, $0
   584  	JNE	havem
   585  needm:
   586  	MOVQ	$runtime·needm(SB), AX
   587  	CALL	AX
   588  	get_tls(CX)
   589  	MOVQ	m(CX), BP
   590  
   591  havem:
   592  	// Now there's a valid m, and we're running on its m->g0.
   593  	// Save current m->g0->sched.sp on stack and then set it to SP.
   594  	// Save current sp in m->g0->sched.sp in preparation for
   595  	// switch back to m->curg stack.
   596  	MOVQ	m_g0(BP), SI
   597  	PUSHQ	(g_sched+gobuf_sp)(SI)
   598  	MOVQ	SP, (g_sched+gobuf_sp)(SI)
   599  
   600  	// Switch to m->curg stack and call runtime.cgocallbackg
   601  	// with the three arguments.  Because we are taking over
   602  	// the execution of m->curg but *not* resuming what had
   603  	// been running, we need to save that information (m->curg->gobuf)
   604  	// so that we can restore it when we're done. 
   605  	// We can restore m->curg->gobuf.sp easily, because calling
   606  	// runtime.cgocallbackg leaves SP unchanged upon return.
   607  	// To save m->curg->gobuf.pc, we push it onto the stack.
   608  	// This has the added benefit that it looks to the traceback
   609  	// routine like cgocallbackg is going to return to that
   610  	// PC (because we defined cgocallbackg to have
   611  	// a frame size of 24, the same amount that we use below),
   612  	// so that the traceback will seamlessly trace back into
   613  	// the earlier calls.
   614  	MOVQ	fn+0(FP), AX
   615  	MOVQ	frame+8(FP), BX
   616  	MOVQ	framesize+16(FP), DX
   617  
   618  	MOVQ	m_curg(BP), SI
   619  	MOVQ	SI, g(CX)
   620  	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
   621  
   622  	// Push gobuf.pc
   623  	MOVQ	(g_sched+gobuf_pc)(SI), BP
   624  	SUBQ	$8, DI
   625  	MOVQ	BP, 0(DI)
   626  
   627  	// Push arguments to cgocallbackg.
   628  	// Frame size here must match the frame size above
   629  	// to trick traceback routines into doing the right thing.
   630  	SUBQ	$24, DI
   631  	MOVQ	AX, 0(DI)
   632  	MOVQ	BX, 8(DI)
   633  	MOVQ	DX, 16(DI)
   634  	
   635  	// Switch stack and make the call.
   636  	MOVQ	DI, SP
   637  	CALL	runtime·cgocallbackg(SB)
   638  
   639  	// Restore g->gobuf (== m->curg->gobuf) from saved values.
   640  	get_tls(CX)
   641  	MOVQ	g(CX), SI
   642  	MOVQ	24(SP), BP
   643  	MOVQ	BP, (g_sched+gobuf_pc)(SI)
   644  	LEAQ	(24+8)(SP), DI
   645  	MOVQ	DI, (g_sched+gobuf_sp)(SI)
   646  
   647  	// Switch back to m->g0's stack and restore m->g0->sched.sp.
   648  	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
   649  	// so we do not have to restore it.)
   650  	MOVQ	m(CX), BP
   651  	MOVQ	m_g0(BP), SI
   652  	MOVQ	SI, g(CX)
   653  	MOVQ	(g_sched+gobuf_sp)(SI), SP
   654  	POPQ	(g_sched+gobuf_sp)(SI)
   655  	
   656  	// If the m on entry was nil, we called needm above to borrow an m
   657  	// for the duration of the call. Since the call is over, return it with dropm.
   658  	POPQ	BP
   659  	CMPQ	BP, $0
   660  	JNE 3(PC)
   661  	MOVQ	$runtime·dropm(SB), AX
   662  	CALL	AX
   663  
   664  	// Done!
   665  	RET
   666  
   667  // void setmg(M*, G*); set m and g. For use by needm.
   668  TEXT runtime·setmg(SB), 7, $0
   669  	MOVQ	mm+0(FP), AX
   670  #ifdef GOOS_windows
   671  	CMPQ	AX, $0
   672  	JNE	settls
   673  	MOVQ	$0, 0x28(GS)
   674  	RET
   675  settls:
   676  	LEAQ	m_tls(AX), AX
   677  	MOVQ	AX, 0x28(GS)
   678  #endif
   679  	get_tls(CX)
   680  	MOVQ	mm+0(FP), AX
   681  	MOVQ	AX, m(CX)
   682  	MOVQ	gg+8(FP), BX
   683  	MOVQ	BX, g(CX)
   684  	RET
   685  
   686  // void setmg_gcc(M*, G*); set m and g. Called from gcc.
   687  TEXT setmg_gcc<>(SB),7,$0
   688  	get_tls(AX)
   689  	MOVQ	DI, m(AX)
   690  	MOVQ	SI, g(AX)
   691  	RET
   692  
   693  // check that SP is in range [g->stackguard, g->stackbase)
   694  TEXT runtime·stackcheck(SB), 7, $0
   695  	get_tls(CX)
   696  	MOVQ	g(CX), AX
   697  	CMPQ	g_stackbase(AX), SP
   698  	JHI	2(PC)
   699  	INT	$3
   700  	CMPQ	SP, g_stackguard(AX)
   701  	JHI	2(PC)
   702  	INT	$3
   703  	RET
   704  
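        // void memclr(byte *ptr, uintptr n)
        // Zero n bytes starting at ptr: eight bytes at a time with REP STOSQ,
        // then the remaining 0-7 bytes with REP STOSB.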
   705  TEXT runtime·memclr(SB),7,$0
   706  	MOVQ	8(SP), DI		// arg 1 addr
   707  	MOVQ	16(SP), CX		// arg 2 count
   708  	MOVQ	CX, BX
   709  	ANDQ	$7, BX
   710  	SHRQ	$3, CX
   711  	MOVQ	$0, AX
   712  	CLD
   713  	REP
   714  	STOSQ
   715  	MOVQ	BX, CX
   716  	REP
   717  	STOSB
   718  	RET
   719  
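        // uintptr getcallerpc(void *argp)
        // argp is the address of the calling function's first argument;
        // that function's return PC sits in the word just below argp.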
   720  TEXT runtime·getcallerpc(SB),7,$0
   721  	MOVQ	x+0(FP),AX		// addr of first arg
   722  	MOVQ	-8(AX),AX		// get calling pc
   723  	RET
   724  
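        // void setcallerpc(void *argp, void *pc)
        // Overwrite the return PC stored in the word just below argp.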
   725  TEXT runtime·setcallerpc(SB),7,$0
   726  	MOVQ	x+0(FP),AX		// addr of first arg
   727  	MOVQ	x+8(FP), BX
   728  	MOVQ	BX, -8(AX)		// set calling pc
   729  	RET
   730  
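        // uintptr getcallersp(void *argp)
        // By convention argp is the address of the caller's first argument,
        // which the runtime uses as the caller's SP, so return it unchanged.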
   731  TEXT runtime·getcallersp(SB),7,$0
   732  	MOVQ	sp+0(FP), AX
   733  	RET
   734  
   735  // int64 runtime·cputicks(void)
   736  TEXT runtime·cputicks(SB),7,$0
   737  	RDTSC			// low 32 bits in AX, high 32 bits in DX
   738  	SHLQ	$32, DX
   739  	ADDQ	DX, AX		// combine into a single 64-bit count
   740  	RET
   741  
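        // func stackguard() (sp, limit uintptr)
        // Report the current SP and the current g's stackguard.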
   742  TEXT runtime·stackguard(SB),7,$0
   743  	MOVQ	SP, DX
   744  	MOVQ	DX, sp+0(FP)
   745  	get_tls(CX)
   746  	MOVQ	g(CX), BX
   747  	MOVQ	g_stackguard(BX), DX
   748  	MOVQ	DX, limit+8(FP)
   749  	RET
   750  
   751  GLOBL runtime·tls0(SB), $64
   752  
   753  // hash function using AES hardware instructions
   754  TEXT runtime·aeshash(SB),7,$0
   755  	MOVQ	8(SP), DX	// ptr to hash value
   756  	MOVQ	16(SP), CX	// size
   757  	MOVQ	24(SP), AX	// ptr to data
   758  	JMP	runtime·aeshashbody(SB)
   759  
   760  TEXT runtime·aeshashstr(SB),7,$0
   761  	MOVQ	8(SP), DX	// ptr to hash value
   762  	MOVQ	24(SP), AX	// ptr to string struct
   763  	MOVQ	8(AX), CX	// length of string
   764  	MOVQ	(AX), AX	// string data
   765  	JMP	runtime·aeshashbody(SB)
   766  
   767  // AX: data
   768  // CX: length
   769  // DX: ptr to seed input / hash output
   770  TEXT runtime·aeshashbody(SB),7,$0
   771  	MOVQ	(DX), X0	// seed to low 64 bits of xmm0
   772  	PINSRQ	$1, CX, X0	// size to high 64 bits of xmm0
   773  	MOVO	runtime·aeskeysched+0(SB), X2
   774  	MOVO	runtime·aeskeysched+16(SB), X3
   775  aesloop:
   776  	CMPQ	CX, $16
   777  	JB	aesloopend
   778  	MOVOU	(AX), X1
   779  	AESENC	X2, X0
   780  	AESENC	X1, X0
   781  	SUBQ	$16, CX
   782  	ADDQ	$16, AX
   783  	JMP	aesloop
   784  aesloopend:
   785  	TESTQ	CX, CX
   786  	JE	finalize	// no partial block
   787  
   788  	TESTQ	$16, AX
   789  	JNE	highpartial
   790  
   791  	// address ends in 0xxxx.  16 bytes loaded
   792  	// at this address won't cross a page boundary, so
   793  	// we can load it directly.
   794  	MOVOU	(AX), X1
   795  	ADDQ	CX, CX
   796  	PAND	masks(SB)(CX*8), X1
   797  	JMP	partial
   798  highpartial:
   799  	// address ends in 1xxxx.  Might be up against
   800  	// a page boundary, so load ending at last byte.
   801  	// Then shift bytes down using pshufb.
   802  	MOVOU	-16(AX)(CX*1), X1
   803  	ADDQ	CX, CX
   804  	PSHUFB	shifts(SB)(CX*8), X1
   805  partial:
   806  	// incorporate partial block into hash
   807  	AESENC	X3, X0
   808  	AESENC	X1, X0
   809  finalize:	
   810  	// finalize hash
   811  	AESENC	X2, X0
   812  	AESENC	X3, X0
   813  	AESENC	X2, X0
   814  	MOVQ	X0, (DX)
   815  	RET
   816  
   817  TEXT runtime·aeshash32(SB),7,$0
   818  	MOVQ	8(SP), DX	// ptr to hash value
   819  	MOVQ	24(SP), AX	// ptr to data
   820  	MOVQ	(DX), X0	// seed
   821  	PINSRD	$2, (AX), X0	// data
   822  	AESENC	runtime·aeskeysched+0(SB), X0
   823  	AESENC	runtime·aeskeysched+16(SB), X0
   824  	AESENC	runtime·aeskeysched+0(SB), X0
   825  	MOVQ	X0, (DX)
   826  	RET
   827  
   828  TEXT runtime·aeshash64(SB),7,$0
   829  	MOVQ	8(SP), DX	// ptr to hash value
   830  	MOVQ	24(SP), AX	// ptr to data
   831  	MOVQ	(DX), X0	// seed
   832  	PINSRQ	$1, (AX), X0	// data
   833  	AESENC	runtime·aeskeysched+0(SB), X0
   834  	AESENC	runtime·aeskeysched+16(SB), X0
   835  	AESENC	runtime·aeskeysched+0(SB), X0
   836  	MOVQ	X0, (DX)
   837  	RET
   838  
   839  // simple mask to get rid of data in the high part of the register.
   840  TEXT masks(SB),7,$0
   841  	QUAD $0x0000000000000000
   842  	QUAD $0x0000000000000000
   843  	QUAD $0x00000000000000ff
   844  	QUAD $0x0000000000000000
   845  	QUAD $0x000000000000ffff
   846  	QUAD $0x0000000000000000
   847  	QUAD $0x0000000000ffffff
   848  	QUAD $0x0000000000000000
   849  	QUAD $0x00000000ffffffff
   850  	QUAD $0x0000000000000000
   851  	QUAD $0x000000ffffffffff
   852  	QUAD $0x0000000000000000
   853  	QUAD $0x0000ffffffffffff
   854  	QUAD $0x0000000000000000
   855  	QUAD $0x00ffffffffffffff
   856  	QUAD $0x0000000000000000
   857  	QUAD $0xffffffffffffffff
   858  	QUAD $0x0000000000000000
   859  	QUAD $0xffffffffffffffff
   860  	QUAD $0x00000000000000ff
   861  	QUAD $0xffffffffffffffff
   862  	QUAD $0x000000000000ffff
   863  	QUAD $0xffffffffffffffff
   864  	QUAD $0x0000000000ffffff
   865  	QUAD $0xffffffffffffffff
   866  	QUAD $0x00000000ffffffff
   867  	QUAD $0xffffffffffffffff
   868  	QUAD $0x000000ffffffffff
   869  	QUAD $0xffffffffffffffff
   870  	QUAD $0x0000ffffffffffff
   871  	QUAD $0xffffffffffffffff
   872  	QUAD $0x00ffffffffffffff
   873  
   874  	// these are arguments to pshufb.  They move data down from
   875  	// the high bytes of the register to the low bytes of the register.
   876  	// index is how many bytes to move.
   877  TEXT shifts(SB),7,$0
   878  	QUAD $0x0000000000000000
   879  	QUAD $0x0000000000000000
   880  	QUAD $0xffffffffffffff0f
   881  	QUAD $0xffffffffffffffff
   882  	QUAD $0xffffffffffff0f0e
   883  	QUAD $0xffffffffffffffff
   884  	QUAD $0xffffffffff0f0e0d
   885  	QUAD $0xffffffffffffffff
   886  	QUAD $0xffffffff0f0e0d0c
   887  	QUAD $0xffffffffffffffff
   888  	QUAD $0xffffff0f0e0d0c0b
   889  	QUAD $0xffffffffffffffff
   890  	QUAD $0xffff0f0e0d0c0b0a
   891  	QUAD $0xffffffffffffffff
   892  	QUAD $0xff0f0e0d0c0b0a09
   893  	QUAD $0xffffffffffffffff
   894  	QUAD $0x0f0e0d0c0b0a0908
   895  	QUAD $0xffffffffffffffff
   896  	QUAD $0x0e0d0c0b0a090807
   897  	QUAD $0xffffffffffffff0f
   898  	QUAD $0x0d0c0b0a09080706
   899  	QUAD $0xffffffffffff0f0e
   900  	QUAD $0x0c0b0a0908070605
   901  	QUAD $0xffffffffff0f0e0d
   902  	QUAD $0x0b0a090807060504
   903  	QUAD $0xffffffff0f0e0d0c
   904  	QUAD $0x0a09080706050403
   905  	QUAD $0xffffff0f0e0d0c0b
   906  	QUAD $0x0908070605040302
   907  	QUAD $0xffff0f0e0d0c0b0a
   908  	QUAD $0x0807060504030201
   909  	QUAD $0xff0f0e0d0c0b0a09
   910  
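        // bool memeq(void *a, void *b, uintptr count)
        // Report whether the count bytes at a and b are identical.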
   911  TEXT runtime·memeq(SB),7,$0
   912  	MOVQ	a+0(FP), SI
   913  	MOVQ	b+8(FP), DI
   914  	MOVQ	count+16(FP), BX
   915  	JMP	runtime·memeqbody(SB)
   916  
   917  
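        // func Equal(a, b []byte) bool
        // Compare lengths first; equal-length slices are handed to memeqbody.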
   918  TEXT bytes·Equal(SB),7,$0
   919  	MOVQ	a_len+8(FP), BX
   920  	MOVQ	b_len+32(FP), CX
   921  	XORQ	AX, AX
   922  	CMPQ	BX, CX
   923  	JNE	eqret
   924  	MOVQ	a+0(FP), SI
   925  	MOVQ	b+24(FP), DI
   926  	CALL	runtime·memeqbody(SB)
   927  eqret:
   928  	MOVB	AX, ret+48(FP)
   929  	RET
   930  
   931  // a in SI
   932  // b in DI
   933  // count in BX
   934  TEXT runtime·memeqbody(SB),7,$0
   935  	XORQ	AX, AX
   936  
   937  	CMPQ	BX, $8
   938  	JB	small
   939  	
   940  	// 64 bytes at a time using xmm registers
   941  hugeloop:
   942  	CMPQ	BX, $64
   943  	JB	bigloop
   944  	MOVOU	(SI), X0
   945  	MOVOU	(DI), X1
   946  	MOVOU	16(SI), X2
   947  	MOVOU	16(DI), X3
   948  	MOVOU	32(SI), X4
   949  	MOVOU	32(DI), X5
   950  	MOVOU	48(SI), X6
   951  	MOVOU	48(DI), X7
   952  	PCMPEQB	X1, X0
   953  	PCMPEQB	X3, X2
   954  	PCMPEQB	X5, X4
   955  	PCMPEQB	X7, X6
   956  	PAND	X2, X0
   957  	PAND	X6, X4
   958  	PAND	X4, X0
   959  	PMOVMSKB X0, DX
   960  	ADDQ	$64, SI
   961  	ADDQ	$64, DI
   962  	SUBQ	$64, BX
   963  	CMPL	DX, $0xffff
   964  	JEQ	hugeloop
   965  	RET
   966  
   967  	// 8 bytes at a time using 64-bit register
   968  bigloop:
   969  	CMPQ	BX, $8
   970  	JBE	leftover
   971  	MOVQ	(SI), CX
   972  	MOVQ	(DI), DX
   973  	ADDQ	$8, SI
   974  	ADDQ	$8, DI
   975  	SUBQ	$8, BX
   976  	CMPQ	CX, DX
   977  	JEQ	bigloop
   978  	RET
   979  
   980  	// remaining 0-8 bytes
   981  leftover:
   982  	MOVQ	-8(SI)(BX*1), CX
   983  	MOVQ	-8(DI)(BX*1), DX
   984  	CMPQ	CX, DX
   985  	SETEQ	AX
   986  	RET
   987  
   988  small:
   989  	CMPQ	BX, $0
   990  	JEQ	equal
   991  
   992  	LEAQ	0(BX*8), CX
   993  	NEGQ	CX
   994  
   995  	CMPB	SI, $0xf8
   996  	JA	si_high
   997  
   998  	// load at SI won't cross a page boundary.
   999  	MOVQ	(SI), SI
  1000  	JMP	si_finish
  1001  si_high:
  1002  	// address ends in 11111xxx.  Load up to bytes we want, move to correct position.
  1003  	MOVQ	-8(SI)(BX*1), SI
  1004  	SHRQ	CX, SI
  1005  si_finish:
  1006  
  1007  	// same for DI.
  1008  	CMPB	DI, $0xf8
  1009  	JA	di_high
  1010  	MOVQ	(DI), DI
  1011  	JMP	di_finish
  1012  di_high:
  1013  	MOVQ	-8(DI)(BX*1), DI
  1014  	SHRQ	CX, DI
  1015  di_finish:
  1016  
  1017  	SUBQ	SI, DI
  1018  	SHLQ	CX, DI
  1019  equal:
  1020  	SETEQ	AX
  1021  	RET