// github.com/peggyl/go@v0.0.0-20151008231540-ae315999c2d5/src/runtime/asm_amd64.s

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "go_tls.h"
#include "funcdata.h"
#include "textflag.h"

// rt0_go is the amd64 program entry: it sets up g0's stack bounds, probes
// CPUID, calls _cgo_init if present, establishes TLS, wires up m0/g0, and
// finally queues runtime.main and starts this M's scheduler loop.
TEXT runtime·rt0_go(SB),NOSPLIT,$0
	// copy arguments forward on an even stack
	MOVQ	DI, AX		// argc
	MOVQ	SI, BX		// argv
	SUBQ	$(4*8+7), SP		// 2args 2auto
	ANDQ	$~15, SP	// 16-byte align the stack
	MOVQ	AX, 16(SP)
	MOVQ	BX, 24(SP)

	// create istack out of the given (operating system) stack.
	// _cgo_init may update stackguard.
	MOVQ	$runtime·g0(SB), DI
	LEAQ	(-64*1024+104)(SP), BX
	MOVQ	BX, g_stackguard0(DI)
	MOVQ	BX, g_stackguard1(DI)
	MOVQ	BX, (g_stack+stack_lo)(DI)
	MOVQ	SP, (g_stack+stack_hi)(DI)

	// find out information about the processor we're on
	MOVQ	$0, AX
	CPUID
	CMPQ	AX, $0
	JE	nocpuinfo

	// Figure out how to serialize RDTSC.
	// On Intel processors LFENCE is enough. AMD requires MFENCE.
	// Don't know about the rest, so let's do MFENCE.
	CMPL	BX, $0x756E6547  // "Genu"
	JNE	notintel
	CMPL	DX, $0x49656E69  // "ineI"
	JNE	notintel
	CMPL	CX, $0x6C65746E  // "ntel"
	JNE	notintel
	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
notintel:

	MOVQ	$1, AX
	CPUID
	MOVL	CX, runtime·cpuid_ecx(SB)
	MOVL	DX, runtime·cpuid_edx(SB)
nocpuinfo:

	// if there is an _cgo_init, call it.
	MOVQ	_cgo_init(SB), AX
	TESTQ	AX, AX
	JZ	needtls
	// g0 already in DI
	MOVQ	DI, CX	// Win64 uses CX for first parameter
	MOVQ	$setg_gcc<>(SB), SI
	CALL	AX

	// update stackguard after _cgo_init
	MOVQ	$runtime·g0(SB), CX
	MOVQ	(g_stack+stack_lo)(CX), AX
	ADDQ	$const__StackGuard, AX
	MOVQ	AX, g_stackguard0(CX)
	MOVQ	AX, g_stackguard1(CX)

	CMPL	runtime·iswindows(SB), $0
	JEQ ok
needtls:
	// skip TLS setup on Plan 9
	CMPL	runtime·isplan9(SB), $1
	JEQ ok
	// skip TLS setup on Solaris
	CMPL	runtime·issolaris(SB), $1
	JEQ ok

	LEAQ	runtime·tls0(SB), DI
	CALL	runtime·settls(SB)

	// store through it, to make sure it works
	get_tls(BX)
	MOVQ	$0x123, g(BX)
	MOVQ	runtime·tls0(SB), AX
	CMPQ	AX, $0x123
	JEQ 2(PC)
	MOVL	AX, 0	// abort
ok:
	// set the per-goroutine and per-mach "registers"
	get_tls(BX)
	LEAQ	runtime·g0(SB), CX
	MOVQ	CX, g(BX)
	LEAQ	runtime·m0(SB), AX

	// save m->g0 = g0
	MOVQ	CX, m_g0(AX)
	// save m0 to g0->m
	MOVQ	AX, g_m(CX)

	CLD				// convention is D is always left cleared
	CALL	runtime·check(SB)

	MOVL	16(SP), AX		// copy argc
	MOVL	AX, 0(SP)
	MOVQ	24(SP), AX		// copy argv
	MOVQ	AX, 8(SP)
	CALL	runtime·args(SB)
	CALL	runtime·osinit(SB)
	CALL	runtime·schedinit(SB)

	// create a new goroutine to start program
	MOVQ	$runtime·mainPC(SB), AX		// entry
	PUSHQ	AX
	PUSHQ	$0			// arg size
	CALL	runtime·newproc(SB)
	POPQ	AX
	POPQ	AX

	// start this M
	CALL	runtime·mstart(SB)

	MOVL	$0xf1, 0xf1  // crash; mstart should never return

	RET

// mainPC is the address of runtime.main, used as the entry for the
// first goroutine created above.
DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
GLOBL	runtime·mainPC(SB),RODATA,$8

TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
	BYTE	$0xcc	// INT 3: trap into the debugger
	RET

TEXT runtime·asminit(SB),NOSPLIT,$0-0
	// No per-thread init.
	RET

/*
 *  go-routine
 */

// void gosave(Gobuf*)
// save state in Gobuf; setjmp
TEXT runtime·gosave(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), AX		// gobuf
	LEAQ	buf+0(FP), BX		// caller's SP
	MOVQ	BX, gobuf_sp(AX)
	MOVQ	0(SP), BX		// caller's PC
	MOVQ	BX, gobuf_pc(AX)
	MOVQ	$0, gobuf_ret(AX)
	MOVQ	$0, gobuf_ctxt(AX)
	MOVQ	BP, gobuf_bp(AX)
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	BX, gobuf_g(AX)
	RET

// void gogo(Gobuf*)
// restore state from Gobuf; longjmp
TEXT runtime·gogo(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), BX		// gobuf
	MOVQ	gobuf_g(BX), DX
	MOVQ	0(DX), CX		// make sure g != nil
	get_tls(CX)
	MOVQ	DX, g(CX)
	MOVQ	gobuf_sp(BX), SP	// restore SP
	MOVQ	gobuf_ret(BX), AX
	MOVQ	gobuf_ctxt(BX), DX
	MOVQ	gobuf_bp(BX), BP
	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
	MOVQ	$0, gobuf_ret(BX)
	MOVQ	$0, gobuf_ctxt(BX)
	MOVQ	$0, gobuf_bp(BX)
	MOVQ	gobuf_pc(BX), BX
	JMP	BX

// func mcall(fn func(*g))
// Switch to m->g0's stack, call fn(g).
// Fn must never return. It should gogo(&g->sched)
// to keep running g.
TEXT runtime·mcall(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI

	get_tls(CX)
	MOVQ	g(CX), AX	// save state in g->sched
	MOVQ	0(SP), BX	// caller's PC
	MOVQ	BX, (g_sched+gobuf_pc)(AX)
	LEAQ	fn+0(FP), BX	// caller's SP
	MOVQ	BX, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to m->g0 & its stack, call fn
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	SI, AX	// if g == m->g0 call badmcall
	JNE	3(PC)
	MOVQ	$runtime·badmcall(SB), AX
	JMP	AX
	MOVQ	SI, g(CX)	// g = m->g0
	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	PUSHQ	AX	// push g as fn's argument
	MOVQ	DI, DX
	MOVQ	0(DI), DI	// load code pointer out of the funcval
	CALL	DI	// fn must not return; falling through is a bug
	POPQ	AX
	MOVQ	$runtime·badmcall2(SB), AX
	JMP	AX
	RET

// systemstack_switch is a dummy routine that systemstack leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the system stack because the one at the top of
// the system stack terminates the stack walk (see topofstack()).
TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
	RET

// func systemstack(fn func())
TEXT runtime·systemstack(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI	// DI = fn
	get_tls(CX)
	MOVQ	g(CX), AX	// AX = g
	MOVQ	g_m(AX), BX	// BX = m

	MOVQ	m_gsignal(BX), DX	// DX = gsignal
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_g0(BX), DX	// DX = g0
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_curg(BX), R8
	CMPQ	AX, R8
	JEQ	switch

	// Bad: g is not gsignal, not g0, not curg. What is it?
	MOVQ	$runtime·badsystemstack(SB), AX
	CALL	AX

switch:
	// save our state in g->sched. Pretend to
	// be systemstack_switch if the G stack is scanned.
	MOVQ	$runtime·systemstack_switch(SB), SI
	MOVQ	SI, (g_sched+gobuf_pc)(AX)
	MOVQ	SP, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to g0
	MOVQ	DX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(DX), BX
	// make it look like mstart called systemstack on g0, to stop traceback
	SUBQ	$8, BX
	MOVQ	$runtime·mstart(SB), DX
	MOVQ	DX, 0(BX)
	MOVQ	BX, SP

	// call target function
	MOVQ	DI, DX
	MOVQ	0(DI), DI	// load code pointer out of the funcval
	CALL	DI

	// switch back to g
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), BX
	MOVQ	m_curg(BX), AX
	MOVQ	AX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(AX), SP
	MOVQ	$0, (g_sched+gobuf_sp)(AX)
	RET

noswitch:
	// already on m stack, just call directly
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI
	RET

/*
 * support for morestack
 */

// Called during function prolog when more stack is needed.
//
// The traceback routines see morestack on a g0 as being
// the top of a stack (for example, morestack calling newstack
// calling the scheduler calling newm calling gc), so we must
// record an argument size. For that purpose, it has no arguments.
TEXT runtime·morestack(SB),NOSPLIT,$0-0
	// Cannot grow scheduler stack (m->g0).
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	g(CX), SI
	JNE	2(PC)
	INT	$3

	// Cannot grow signal stack (m->gsignal).
	MOVQ	m_gsignal(BX), SI
	CMPQ	g(CX), SI
	JNE	2(PC)
	INT	$3

	// Called from f.
	// Set m->morebuf to f's caller.
	MOVQ	8(SP), AX	// f's caller's PC
	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
	LEAQ	16(SP), AX	// f's caller's SP
	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SI, (m_morebuf+gobuf_g)(BX)

	// Set g->sched to context in f.
	MOVQ	0(SP), AX // f's PC
	MOVQ	AX, (g_sched+gobuf_pc)(SI)
	MOVQ	SI, (g_sched+gobuf_g)(SI)
	LEAQ	8(SP), AX // f's SP
	MOVQ	AX, (g_sched+gobuf_sp)(SI)
	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
	MOVQ	BP, (g_sched+gobuf_bp)(SI)

	// Call newstack on m->g0's stack.
	MOVQ	m_g0(BX), BX
	MOVQ	BX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(BX), SP
	CALL	runtime·newstack(SB)
	MOVQ	$0, 0x1003	// crash if newstack returns
	RET

// morestack but not preserving ctxt.
TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
	MOVL	$0, DX
	JMP	runtime·morestack(SB)

TEXT runtime·stackBarrier(SB),NOSPLIT,$0
	// We came here via a RET to an overwritten return PC.
	// AX may be live. Other registers are available.

	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
	get_tls(CX)
	MOVQ	g(CX), CX
	MOVQ	(g_stkbar+slice_array)(CX), DX
	MOVQ	g_stkbarPos(CX), BX
	IMULQ	$stkbar__size, BX	// Too big for SIB.
	MOVQ	stkbar_savedLRPtr(DX)(BX*1), R8
	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
	// Assert that we're popping the right saved LR.
	CMPQ	R8, SP
	JNE	2(PC)
	MOVL	$0, 0	// fault: wrong barrier slot for this SP
	// Record that this stack barrier was hit.
	ADDQ	$1, g_stkbarPos(CX)
	// Jump to the original return PC.
	JMP	BX

// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
// of constant-sized-frame functions to encode a few bits of size in the pc.
// Caution: ugly multiline assembly macros in your future!

#define DISPATCH(NAME,MAXSIZE)		\
	CMPQ	CX, $MAXSIZE;		\
	JA	3(PC);			\
	MOVQ	$NAME(SB), AX;		\
	JMP	AX
// Note: can't just "JMP NAME(SB)" - bad inlining results.
TEXT reflect·call(SB), NOSPLIT, $0-0
	JMP	·reflectcall(SB)

// reflectcall dispatches to the callNNN routine whose fixed frame size is
// the smallest one that can hold argsize bytes of arguments.
TEXT ·reflectcall(SB), NOSPLIT, $0-32
	MOVLQZX argsize+24(FP), CX
	// NOTE(rsc): No call16, because CALLFN needs four words
	// of argument space to invoke callwritebarrier.
	DISPATCH(runtime·call32, 32)
	DISPATCH(runtime·call64, 64)
	DISPATCH(runtime·call128, 128)
	DISPATCH(runtime·call256, 256)
	DISPATCH(runtime·call512, 512)
	DISPATCH(runtime·call1024, 1024)
	DISPATCH(runtime·call2048, 2048)
	DISPATCH(runtime·call4096, 4096)
	DISPATCH(runtime·call8192, 8192)
	DISPATCH(runtime·call16384, 16384)
	DISPATCH(runtime·call32768, 32768)
	DISPATCH(runtime·call65536, 65536)
	DISPATCH(runtime·call131072, 131072)
	DISPATCH(runtime·call262144, 262144)
	DISPATCH(runtime·call524288, 524288)
	DISPATCH(runtime·call1048576, 1048576)
	DISPATCH(runtime·call2097152, 2097152)
	DISPATCH(runtime·call4194304, 4194304)
	DISPATCH(runtime·call8388608, 8388608)
	DISPATCH(runtime·call16777216, 16777216)
	DISPATCH(runtime·call33554432, 33554432)
	DISPATCH(runtime·call67108864, 67108864)
	DISPATCH(runtime·call134217728, 134217728)
	DISPATCH(runtime·call268435456, 268435456)
	DISPATCH(runtime·call536870912, 536870912)
	DISPATCH(runtime·call1073741824, 1073741824)
	MOVQ	$runtime·badreflectcall(SB), AX
	JMP	AX

// CALLFN defines one fixed-frame-size call trampoline: copy the arguments
// in, call the function, copy the results back out, then run write
// barriers over the result memory.
#define CALLFN(NAME,MAXSIZE)			\
TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
	NO_LOCAL_POINTERS;			\
	/* copy arguments to stack */		\
	MOVQ	argptr+16(FP), SI;		\
	MOVLQZX argsize+24(FP), CX;		\
	MOVQ	SP, DI;				\
	REP;MOVSB;				\
	/* call function */			\
	MOVQ	f+8(FP), DX;			\
	PCDATA  $PCDATA_StackMapIndex, $0;	\
	CALL	(DX);				\
	/* copy return values back */		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX	argsize+24(FP), CX;		\
	MOVLQZX retoffset+28(FP), BX;		\
	MOVQ	SP, SI;				\
	ADDQ	BX, DI;				\
	ADDQ	BX, SI;				\
	SUBQ	BX, CX;				\
	REP;MOVSB;				\
	/* execute write barrier updates */	\
	MOVQ	argtype+0(FP), DX;		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX	argsize+24(FP), CX;		\
	MOVLQZX retoffset+28(FP), BX;		\
	MOVQ	DX, 0(SP);			\
	MOVQ	DI, 8(SP);			\
	MOVQ	CX, 16(SP);			\
	MOVQ	BX, 24(SP);			\
	CALL	runtime·callwritebarrier(SB);	\
	RET

CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
CALLFN(·call256, 256)
CALLFN(·call512, 512)
CALLFN(·call1024, 1024)
CALLFN(·call2048, 2048)
CALLFN(·call4096, 4096)
CALLFN(·call8192, 8192)
CALLFN(·call16384, 16384)
CALLFN(·call32768, 32768)
CALLFN(·call65536, 65536)
CALLFN(·call131072, 131072)
CALLFN(·call262144, 262144)
CALLFN(·call524288, 524288)
CALLFN(·call1048576, 1048576)
CALLFN(·call2097152, 2097152)
CALLFN(·call4194304, 4194304)
CALLFN(·call8388608, 8388608)
CALLFN(·call16777216, 16777216)
CALLFN(·call33554432, 33554432)
CALLFN(·call67108864, 67108864)
CALLFN(·call134217728, 134217728)
CALLFN(·call268435456, 268435456)
CALLFN(·call536870912, 536870912)
CALLFN(·call1073741824, 1073741824)

// bool cas(int32 *val, int32 old, int32 new)
// Atomically:
//	if(*val == old){
//		*val = new;
//		return 1;
//	} else
//		return 0;
TEXT runtime·cas(SB), NOSPLIT, $0-17
	MOVQ	ptr+0(FP), BX
	MOVL	old+8(FP), AX
	MOVL	new+12(FP), CX
	LOCK
	CMPXCHGL	CX, 0(BX)
	SETEQ	ret+16(FP)	// ZF set by CMPXCHG iff the swap happened
	RET

// bool	runtime·cas64(uint64 *val, uint64 old, uint64 new)
// Atomically:
//	if(*val == *old){
//		*val = new;
//		return 1;
//	} else {
//		return 0;
//	}
TEXT runtime·cas64(SB), NOSPLIT, $0-25
	MOVQ	ptr+0(FP), BX
	MOVQ	old+8(FP), AX
	MOVQ	new+16(FP), CX
	LOCK
	CMPXCHGQ	CX, 0(BX)
	SETEQ	ret+24(FP)
	RET

TEXT runtime·casuintptr(SB), NOSPLIT, $0-25
	JMP	runtime·cas64(SB)

TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-16
	JMP	runtime·atomicload64(SB)

TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-16
	JMP	runtime·atomicload64(SB)

TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16
	JMP	runtime·atomicstore64(SB)

// bool casp(void **val, void *old, void *new)
// Atomically:
//	if(*val == old){
//		*val = new;
//		return 1;
//	} else
//		return 0;
TEXT runtime·casp1(SB), NOSPLIT, $0-25
	MOVQ	ptr+0(FP), BX
	MOVQ	old+8(FP), AX
	MOVQ	new+16(FP), CX
	LOCK
	CMPXCHGQ	CX, 0(BX)
	SETEQ	ret+24(FP)
	RET

// uint32 xadd(uint32 volatile *val, int32 delta)
// Atomically:
//	*val += delta;
//	return *val;
TEXT runtime·xadd(SB), NOSPLIT, $0-20
	MOVQ	ptr+0(FP), BX
	MOVL	delta+8(FP), AX
	MOVL	AX, CX
	LOCK
	XADDL	AX, 0(BX)
	ADDL	CX, AX	// XADD returns the old value; add delta to get the new
	MOVL	AX, ret+16(FP)
	RET

TEXT runtime·xadd64(SB), NOSPLIT, $0-24
	MOVQ	ptr+0(FP), BX
	MOVQ	delta+8(FP), AX
	MOVQ	AX, CX
	LOCK
	XADDQ	AX, 0(BX)
	ADDQ	CX, AX
	MOVQ	AX, ret+16(FP)
	RET

TEXT runtime·xadduintptr(SB), NOSPLIT, $0-24
	JMP	runtime·xadd64(SB)

TEXT runtime·xchg(SB), NOSPLIT, $0-20
	MOVQ	ptr+0(FP), BX
	MOVL	new+8(FP), AX
	XCHGL	AX, 0(BX)	// XCHG with memory is implicitly locked
	MOVL	AX, ret+16(FP)
	RET

TEXT runtime·xchg64(SB), NOSPLIT, $0-24
	MOVQ	ptr+0(FP), BX
	MOVQ	new+8(FP), AX
	XCHGQ	AX, 0(BX)
	MOVQ	AX, ret+16(FP)
	RET

TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24
	JMP	runtime·xchg64(SB)

TEXT runtime·procyield(SB),NOSPLIT,$0-0
	MOVL	cycles+0(FP), AX
again:
	PAUSE
	SUBL	$1, AX
	JNZ	again
	RET

TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16
	MOVQ	ptr+0(FP), BX
	MOVQ	val+8(FP), AX
	XCHGQ	AX, 0(BX)
	RET

TEXT runtime·atomicstore(SB), NOSPLIT, $0-12
	MOVQ	ptr+0(FP), BX
	MOVL	val+8(FP), AX
	XCHGL	AX, 0(BX)
	RET

TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16
	MOVQ	ptr+0(FP), BX
	MOVQ	val+8(FP), AX
	XCHGQ	AX, 0(BX)
	RET

// void	runtime·atomicor8(byte volatile*, byte);
TEXT runtime·atomicor8(SB), NOSPLIT, $0-9
	MOVQ	ptr+0(FP), AX
	MOVB	val+8(FP), BX
	LOCK
	ORB	BX, (AX)
	RET

// void	runtime·atomicand8(byte volatile*, byte);
TEXT runtime·atomicand8(SB), NOSPLIT, $0-9
	MOVQ	ptr+0(FP), AX
	MOVB	val+8(FP), BX
	LOCK
	ANDB	BX, (AX)
	RET

TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
	// Stores are already ordered on x86, so this is just a
	// compile barrier.
	RET

// void jmpdefer(fn, sp);
// called from deferreturn.
// 1. pop the caller
// 2. sub 5 bytes from the callers return
// 3. jmp to the argument
TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
	MOVQ	fv+0(FP), DX	// fn
	MOVQ	argp+8(FP), BX	// caller sp
	LEAQ	-8(BX), SP	// caller sp after CALL
	SUBQ	$5, (SP)	// return to CALL again (5 = length of the CALL instruction)
	MOVQ	0(DX), BX
	JMP	BX	// but first run the deferred function

// Save state of caller into g->sched. Smashes R8, R9.
TEXT gosave<>(SB),NOSPLIT,$0
	get_tls(R8)
	MOVQ	g(R8), R8
	MOVQ	0(SP), R9
	MOVQ	R9, (g_sched+gobuf_pc)(R8)
	LEAQ	8(SP), R9
	MOVQ	R9, (g_sched+gobuf_sp)(R8)
	MOVQ	$0, (g_sched+gobuf_ret)(R8)
	MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
	MOVQ	BP, (g_sched+gobuf_bp)(R8)
	RET

// func asmcgocall(fn, arg unsafe.Pointer) int32
// Call fn(arg) on the scheduler stack,
// aligned appropriately for the gcc ABI.
// See cgocall.go for more details.
TEXT ·asmcgocall(SB),NOSPLIT,$0-20
	MOVQ	fn+0(FP), AX
	MOVQ	arg+8(FP), BX

	MOVQ	SP, DX

	// Figure out if we need to switch to m->g0 stack.
	// We get called to create new OS threads too, and those
	// come in on the m->g0 stack already.
	get_tls(CX)
	MOVQ	g(CX), R8
	MOVQ	g_m(R8), R8
	MOVQ	m_g0(R8), SI
	MOVQ	g(CX), DI
	CMPQ	SI, DI
	JEQ	nosave
	MOVQ	m_gsignal(R8), SI
	CMPQ	SI, DI
	JEQ	nosave

	MOVQ	m_g0(R8), SI
	CALL	gosave<>(SB)
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP
nosave:

	// Now on a scheduling stack (a pthread-created stack).
	// Make sure we have enough room for 4 stack-backed fast-call
	// registers as per windows amd64 calling convention.
	SUBQ	$64, SP
	ANDQ	$~15, SP	// alignment for gcc ABI
	MOVQ	DI, 48(SP)	// save g
	MOVQ	(g_stack+stack_hi)(DI), DI
	SUBQ	DX, DI
	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
	MOVQ	BX, CX		// CX = first argument in Win64
	CALL	AX

	// Restore registers, g, stack pointer.
	get_tls(CX)
	MOVQ	48(SP), DI
	MOVQ	(g_stack+stack_hi)(DI), SI
	SUBQ	40(SP), SI
	MOVQ	DI, g(CX)
	MOVQ	SI, SP

	MOVL	AX, ret+16(FP)
	RET

// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
// Turn the fn into a Go func (by taking its address) and call
// cgocallback_gofunc.
TEXT runtime·cgocallback(SB),NOSPLIT,$24-24
	LEAQ	fn+0(FP), AX
	MOVQ	AX, 0(SP)
	MOVQ	frame+8(FP), AX
	MOVQ	AX, 8(SP)
	MOVQ	framesize+16(FP), AX
	MOVQ	AX, 16(SP)
	MOVQ	$runtime·cgocallback_gofunc(SB), AX
	CALL	AX
	RET

// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
// See cgocall.go for more details.
TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-24
	NO_LOCAL_POINTERS

	// If g is nil, Go did not create the current thread.
	// Call needm to obtain one m for temporary use.
	// In this case, we're running on the thread stack, so there's
	// lots of space, but the linker doesn't know. Hide the call from
	// the linker analysis by using an indirect call through AX.
	get_tls(CX)
#ifdef GOOS_windows
	MOVL	$0, BX
	CMPQ	CX, $0
	JEQ	2(PC)
#endif
	MOVQ	g(CX), BX
	CMPQ	BX, $0
	JEQ	needm
	MOVQ	g_m(BX), BX
	MOVQ	BX, R8 // holds oldm until end of function
	JMP	havem
needm:
	MOVQ	$0, 0(SP)
	MOVQ	$runtime·needm(SB), AX
	CALL	AX
	MOVQ	0(SP), R8
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX

	// Set m->sched.sp = SP, so that if a panic happens
	// during the function we are about to execute, it will
	// have a valid SP to run on the g0 stack.
	// The next few lines (after the havem label)
	// will save this SP onto the stack and then write
	// the same SP back to m->sched.sp. That seems redundant,
	// but if an unrecovered panic happens, unwindm will
	// restore the g->sched.sp from the stack location
	// and then systemstack will try to use it. If we don't set it here,
	// that restored SP will be uninitialized (typically 0) and
	// will not be usable.
	MOVQ	m_g0(BX), SI
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

havem:
	// Now there's a valid m, and we're running on its m->g0.
	// Save current m->g0->sched.sp on stack and then set it to SP.
	// Save current sp in m->g0->sched.sp in preparation for
	// switch back to m->curg stack.
	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
	MOVQ	m_g0(BX), SI
	MOVQ	(g_sched+gobuf_sp)(SI), AX
	MOVQ	AX, 0(SP)
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

	// Switch to m->curg stack and call runtime.cgocallbackg.
	// Because we are taking over the execution of m->curg
	// but *not* resuming what had been running, we need to
	// save that information (m->curg->sched) so we can restore it.
	// We can restore m->curg->sched.sp easily, because calling
	// runtime.cgocallbackg leaves SP unchanged upon return.
	// To save m->curg->sched.pc, we push it onto the stack.
	// This has the added benefit that it looks to the traceback
	// routine like cgocallbackg is going to return to that
	// PC (because the frame we allocate below has the same
	// size as cgocallback_gofunc's frame declared above)
	// so that the traceback will seamlessly trace back into
	// the earlier calls.
	//
	// In the new goroutine, 0(SP) holds the saved R8.
	MOVQ	m_curg(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
	MOVQ	(g_sched+gobuf_pc)(SI), BX
	MOVQ	BX, -8(DI)
	// Compute the size of the frame, including return PC and, if
	// GOEXPERIMENT=framepointer, the saved based pointer
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX
	SUBQ	AX, DI
	MOVQ	DI, SP

	MOVQ	R8, 0(SP)
	CALL	runtime·cgocallbackg(SB)
	MOVQ	0(SP), R8

	// Compute the size of the frame again. FP and SP have
	// completely different values here than they did above,
	// but only their difference matters.
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX

	// Restore g->sched (== m->curg->sched) from saved values.
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SP, DI
	ADDQ	AX, DI
	MOVQ	-8(DI), BX
	MOVQ	BX, (g_sched+gobuf_pc)(SI)
	MOVQ	DI, (g_sched+gobuf_sp)(SI)

	// Switch back to m->g0's stack and restore m->g0->sched.sp.
	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
	// so we do not have to restore it.)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP
	MOVQ	0(SP), AX
	MOVQ	AX, (g_sched+gobuf_sp)(SI)

	// If the m on entry was nil, we called needm above to borrow an m
	// for the duration of the call. Since the call is over, return it with dropm.
	CMPQ	R8, $0
	JNE 3(PC)
	MOVQ	$runtime·dropm(SB), AX
	CALL	AX

	// Done!
	RET

// void setg(G*); set g. for use by needm.
TEXT runtime·setg(SB), NOSPLIT, $0-8
	MOVQ	gg+0(FP), BX
#ifdef GOOS_windows
	CMPQ	BX, $0
	JNE	settls
	MOVQ	$0, 0x28(GS)	// clear the Windows TLS slot when g is nil
	RET
settls:
	MOVQ	g_m(BX), AX
	LEAQ	m_tls(AX), AX
	MOVQ	AX, 0x28(GS)
#endif
	get_tls(CX)
	MOVQ	BX, g(CX)
	RET

// void setg_gcc(G*); set g called from gcc.
TEXT setg_gcc<>(SB),NOSPLIT,$0
	get_tls(AX)
	MOVQ	DI, g(AX)	// DI carries g per the System V C calling convention
	RET

// check that SP is in range [g->stack.lo, g->stack.hi)
TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
	get_tls(CX)
	MOVQ	g(CX), AX
	CMPQ	(g_stack+stack_hi)(AX), SP
	JHI	2(PC)
	INT	$3	// SP above stack.hi: abort
	CMPQ	SP, (g_stack+stack_lo)(AX)
	JHI	2(PC)
	INT	$3	// SP at or below stack.lo: abort
	RET

TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
	MOVQ	argp+0(FP),AX		// addr of first arg
	MOVQ	-8(AX),AX		// get calling pc
	CMPQ	AX, runtime·stackBarrierPC(SB)
	JNE	nobar
	// Get original return PC.
	CALL	runtime·nextBarrierPC(SB)
	MOVQ	0(SP), AX
nobar:
	MOVQ	AX, ret+8(FP)
	RET

TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
	MOVQ	argp+0(FP),AX		// addr of first arg
	MOVQ	pc+8(FP), BX
	MOVQ	-8(AX), CX
	CMPQ	CX, runtime·stackBarrierPC(SB)
	JEQ	setbar
	MOVQ	BX, -8(AX)		// set calling pc
	RET
setbar:
	// Set the stack barrier return PC.
	MOVQ	BX, 0(SP)
	CALL	runtime·setNextBarrierPC(SB)
	RET

TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
	MOVQ	argp+0(FP), AX
	MOVQ	AX, ret+8(FP)
	RET

// func cputicks() int64
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
	// Serialize RDTSC with LFENCE on Intel, MFENCE elsewhere
	// (see the CPUID probe in rt0_go).
	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
	JNE	mfence
	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
	JMP	done
mfence:
	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
done:
	RDTSC
	SHLQ	$32, DX	// combine EDX:EAX into a 64-bit tick count
	ADDQ	DX, AX
	MOVQ	AX, ret+0(FP)
	RET

// memhash_varlen(p unsafe.Pointer, h seed) uintptr
// redirects to memhash(p, h, size) using the size
// stored in the closure.
TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
	GO_ARGS
	NO_LOCAL_POINTERS
	MOVQ	p+0(FP), AX
	MOVQ	h+8(FP), BX
	MOVQ	8(DX), CX	// size is read from the closure context in DX
	MOVQ	AX, 0(SP)
	MOVQ	BX, 8(SP)
	MOVQ	CX, 16(SP)
	CALL	runtime·memhash(SB)
	MOVQ	24(SP), AX
	MOVQ	AX, ret+16(FP)
	RET

// hash function using AES hardware instructions
TEXT runtime·aeshash(SB),NOSPLIT,$0-32
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	s+16(FP), CX	// size
	LEAQ	ret+24(FP), DX
	JMP	runtime·aeshashbody(SB)

TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to string struct
	MOVQ	8(AX), CX	// length of string
	MOVQ	(AX), AX	// string data
	LEAQ	ret+16(FP), DX
	JMP	runtime·aeshashbody(SB)

// AX: data
// CX: length
// DX: address to put return value
TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
	// Fill an SSE register with our seeds.
	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
	PINSRW	$4, CX, X0			// 16 bits of length
	PSHUFHW $0, X0, X0			// repeat length 4 times total
	MOVO	X0, X1				// save unscrambled seed
	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
	AESENC	X0, X0				// scramble seed

	// Dispatch on input length; larger inputs use more parallel
	// AES lanes.
	CMPQ	CX, $16
	JB	aes0to15
	JE	aes16
	CMPQ	CX, $32
	JBE	aes17to32
	CMPQ	CX, $64
	JBE	aes33to64
	CMPQ	CX, $128
	JBE	aes65to128
	JMP	aes129plus

aes0to15:
	TESTQ	CX, CX
	JE	aes0

	ADDQ	$16, AX
	TESTW	$0xff0, AX
	JE	endofpage

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X1
	ADDQ	CX, CX
	MOVQ	$masks<>(SB), AX
	PAND	(AX)(CX*8), X1	// mask off bytes beyond the input length
final1:
	AESENC	X0, X1	// scramble input, xor in seed
	AESENC	X1, X1  // scramble combo 2 times
	AESENC	X1, X1
	MOVQ	X1, (DX)
	RET

endofpage:
	// address ends in 1111xxxx. Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	MOVOU	-32(AX)(CX*1), X1
	ADDQ	CX, CX
	MOVQ	$shifts<>(SB), AX
	PSHUFB	(AX)(CX*8), X1
	JMP	final1

aes0:
	// Return scrambled input seed
	AESENC	X0, X0
	MOVQ	X0, (DX)
	RET

aes16:
	MOVOU	(AX), X1
	JMP	final1

aes17to32:
	// make second starting seed
	PXOR	runtime·aeskeysched+16(SB), X1
	AESENC	X1, X1

	// load data to be hashed
	MOVOU	(AX), X2
	MOVOU	-16(AX)(CX*1), X3	// may overlap X2's bytes; harmless for hashing

	// scramble 3 times
	AESENC	X0, X2
	AESENC	X1, X3
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3

	// combine results
	PXOR	X3, X2
	MOVQ	X2, (DX)
	RET

aes33to64:
	// make 3 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3

	MOVOU	(AX), X4
	MOVOU	16(AX), X5
	MOVOU	-32(AX)(CX*1), X6
	MOVOU	-16(AX)(CX*1), X7

	AESENC	X0, X4
	AESENC	X1, X5
	AESENC	X2, X6
	AESENC	X3, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	PXOR	X6, X4
	PXOR	X7, X5
	PXOR	X5, X4
	MOVQ	X4, (DX)
	RET

aes65to128:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// load data
	MOVOU	(AX), X8
	MOVOU	16(AX), X9
	MOVOU	32(AX), X10
	MOVOU	48(AX), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// scramble data, xor in seed
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	// scramble twice
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

aes129plus:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// start with last (possibly overlapping) block
	MOVOU	-128(AX)(CX*1), X8
	MOVOU	-112(AX)(CX*1), X9
	MOVOU	-96(AX)(CX*1), X10
	MOVOU	-80(AX)(CX*1), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// scramble input once, xor in seed
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	// compute number of remaining 128-byte blocks
	DECQ	CX
	SHRQ	$7, CX

aesloop:
	// scramble state, xor in a block
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	MOVOU	64(AX), X4
	MOVOU	80(AX), X5
	MOVOU	96(AX), X6
	MOVOU	112(AX), X7
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	// scramble state
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	ADDQ	$128, AX
	DECQ	CX
	JNE	aesloop

	// 2 more scrambles to finish
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

// simple mask to get rid of data in the high part of the register.
1275 DATA masks<>+0x00(SB)/8, $0x0000000000000000 1276 DATA masks<>+0x08(SB)/8, $0x0000000000000000 1277 DATA masks<>+0x10(SB)/8, $0x00000000000000ff 1278 DATA masks<>+0x18(SB)/8, $0x0000000000000000 1279 DATA masks<>+0x20(SB)/8, $0x000000000000ffff 1280 DATA masks<>+0x28(SB)/8, $0x0000000000000000 1281 DATA masks<>+0x30(SB)/8, $0x0000000000ffffff 1282 DATA masks<>+0x38(SB)/8, $0x0000000000000000 1283 DATA masks<>+0x40(SB)/8, $0x00000000ffffffff 1284 DATA masks<>+0x48(SB)/8, $0x0000000000000000 1285 DATA masks<>+0x50(SB)/8, $0x000000ffffffffff 1286 DATA masks<>+0x58(SB)/8, $0x0000000000000000 1287 DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff 1288 DATA masks<>+0x68(SB)/8, $0x0000000000000000 1289 DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff 1290 DATA masks<>+0x78(SB)/8, $0x0000000000000000 1291 DATA masks<>+0x80(SB)/8, $0xffffffffffffffff 1292 DATA masks<>+0x88(SB)/8, $0x0000000000000000 1293 DATA masks<>+0x90(SB)/8, $0xffffffffffffffff 1294 DATA masks<>+0x98(SB)/8, $0x00000000000000ff 1295 DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff 1296 DATA masks<>+0xa8(SB)/8, $0x000000000000ffff 1297 DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff 1298 DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff 1299 DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff 1300 DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff 1301 DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff 1302 DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff 1303 DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff 1304 DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff 1305 DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff 1306 DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff 1307 GLOBL masks<>(SB),RODATA,$256 1308 1309 // these are arguments to pshufb. They move data down from 1310 // the high bytes of the register to the low bytes of the register. 1311 // index is how many bytes to move. 
1312 DATA shifts<>+0x00(SB)/8, $0x0000000000000000 1313 DATA shifts<>+0x08(SB)/8, $0x0000000000000000 1314 DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f 1315 DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff 1316 DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e 1317 DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff 1318 DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d 1319 DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff 1320 DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c 1321 DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff 1322 DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b 1323 DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff 1324 DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a 1325 DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff 1326 DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09 1327 DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff 1328 DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908 1329 DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff 1330 DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807 1331 DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f 1332 DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706 1333 DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e 1334 DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605 1335 DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d 1336 DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504 1337 DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c 1338 DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403 1339 DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b 1340 DATA shifts<>+0xe0(SB)/8, $0x0908070605040302 1341 DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a 1342 DATA shifts<>+0xf0(SB)/8, $0x0807060504030201 1343 DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09 1344 GLOBL shifts<>(SB),RODATA,$256 1345 1346 TEXT runtime·memeq(SB),NOSPLIT,$0-25 1347 MOVQ a+0(FP), SI 1348 MOVQ b+8(FP), DI 1349 MOVQ size+16(FP), BX 1350 LEAQ ret+24(FP), AX 1351 JMP runtime·memeqbody(SB) 1352 1353 // memequal_varlen(a, b unsafe.Pointer) bool 1354 TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17 1355 MOVQ a+0(FP), SI 1356 MOVQ b+8(FP), DI 1357 
CMPQ SI, DI 1358 JEQ eq 1359 MOVQ 8(DX), BX // compiler stores size at offset 8 in the closure 1360 LEAQ ret+16(FP), AX 1361 JMP runtime·memeqbody(SB) 1362 eq: 1363 MOVB $1, ret+16(FP) 1364 RET 1365 1366 // eqstring tests whether two strings are equal. 1367 // The compiler guarantees that strings passed 1368 // to eqstring have equal length. 1369 // See runtime_test.go:eqstring_generic for 1370 // equivalent Go code. 1371 TEXT runtime·eqstring(SB),NOSPLIT,$0-33 1372 MOVQ s1str+0(FP), SI 1373 MOVQ s2str+16(FP), DI 1374 CMPQ SI, DI 1375 JEQ eq 1376 MOVQ s1len+8(FP), BX 1377 LEAQ v+32(FP), AX 1378 JMP runtime·memeqbody(SB) 1379 eq: 1380 MOVB $1, v+32(FP) 1381 RET 1382 1383 // a in SI 1384 // b in DI 1385 // count in BX 1386 // address of result byte in AX 1387 TEXT runtime·memeqbody(SB),NOSPLIT,$0-0 1388 CMPQ BX, $8 1389 JB small 1390 1391 // 64 bytes at a time using xmm registers 1392 hugeloop: 1393 CMPQ BX, $64 1394 JB bigloop 1395 MOVOU (SI), X0 1396 MOVOU (DI), X1 1397 MOVOU 16(SI), X2 1398 MOVOU 16(DI), X3 1399 MOVOU 32(SI), X4 1400 MOVOU 32(DI), X5 1401 MOVOU 48(SI), X6 1402 MOVOU 48(DI), X7 1403 PCMPEQB X1, X0 1404 PCMPEQB X3, X2 1405 PCMPEQB X5, X4 1406 PCMPEQB X7, X6 1407 PAND X2, X0 1408 PAND X6, X4 1409 PAND X4, X0 1410 PMOVMSKB X0, DX 1411 ADDQ $64, SI 1412 ADDQ $64, DI 1413 SUBQ $64, BX 1414 CMPL DX, $0xffff 1415 JEQ hugeloop 1416 MOVB $0, (AX) 1417 RET 1418 1419 // 8 bytes at a time using 64-bit register 1420 bigloop: 1421 CMPQ BX, $8 1422 JBE leftover 1423 MOVQ (SI), CX 1424 MOVQ (DI), DX 1425 ADDQ $8, SI 1426 ADDQ $8, DI 1427 SUBQ $8, BX 1428 CMPQ CX, DX 1429 JEQ bigloop 1430 MOVB $0, (AX) 1431 RET 1432 1433 // remaining 0-8 bytes 1434 leftover: 1435 MOVQ -8(SI)(BX*1), CX 1436 MOVQ -8(DI)(BX*1), DX 1437 CMPQ CX, DX 1438 SETEQ (AX) 1439 RET 1440 1441 small: 1442 CMPQ BX, $0 1443 JEQ equal 1444 1445 LEAQ 0(BX*8), CX 1446 NEGQ CX 1447 1448 CMPB SI, $0xf8 1449 JA si_high 1450 1451 // load at SI won't cross a page boundary. 
1452 MOVQ (SI), SI 1453 JMP si_finish 1454 si_high: 1455 // address ends in 11111xxx. Load up to bytes we want, move to correct position. 1456 MOVQ -8(SI)(BX*1), SI 1457 SHRQ CX, SI 1458 si_finish: 1459 1460 // same for DI. 1461 CMPB DI, $0xf8 1462 JA di_high 1463 MOVQ (DI), DI 1464 JMP di_finish 1465 di_high: 1466 MOVQ -8(DI)(BX*1), DI 1467 SHRQ CX, DI 1468 di_finish: 1469 1470 SUBQ SI, DI 1471 SHLQ CX, DI 1472 equal: 1473 SETEQ (AX) 1474 RET 1475 1476 TEXT runtime·cmpstring(SB),NOSPLIT,$0-40 1477 MOVQ s1_base+0(FP), SI 1478 MOVQ s1_len+8(FP), BX 1479 MOVQ s2_base+16(FP), DI 1480 MOVQ s2_len+24(FP), DX 1481 LEAQ ret+32(FP), R9 1482 JMP runtime·cmpbody(SB) 1483 1484 TEXT bytes·Compare(SB),NOSPLIT,$0-56 1485 MOVQ s1+0(FP), SI 1486 MOVQ s1+8(FP), BX 1487 MOVQ s2+24(FP), DI 1488 MOVQ s2+32(FP), DX 1489 LEAQ res+48(FP), R9 1490 JMP runtime·cmpbody(SB) 1491 1492 // input: 1493 // SI = a 1494 // DI = b 1495 // BX = alen 1496 // DX = blen 1497 // R9 = address of output word (stores -1/0/1 here) 1498 TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 1499 CMPQ SI, DI 1500 JEQ allsame 1501 CMPQ BX, DX 1502 MOVQ DX, R8 1503 CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare 1504 CMPQ R8, $8 1505 JB small 1506 1507 CMPQ R8, $63 1508 JA big_loop 1509 loop: 1510 CMPQ R8, $16 1511 JBE _0through16 1512 MOVOU (SI), X0 1513 MOVOU (DI), X1 1514 PCMPEQB X0, X1 1515 PMOVMSKB X1, AX 1516 XORQ $0xffff, AX // convert EQ to NE 1517 JNE diff16 // branch if at least one byte is not equal 1518 ADDQ $16, SI 1519 ADDQ $16, DI 1520 SUBQ $16, R8 1521 JMP loop 1522 1523 diff64: 1524 ADDQ $48, SI 1525 ADDQ $48, DI 1526 JMP diff16 1527 diff48: 1528 ADDQ $32, SI 1529 ADDQ $32, DI 1530 JMP diff16 1531 diff32: 1532 ADDQ $16, SI 1533 ADDQ $16, DI 1534 // AX = bit mask of differences 1535 diff16: 1536 BSFQ AX, BX // index of first byte that differs 1537 XORQ AX, AX 1538 MOVB (SI)(BX*1), CX 1539 CMPB CX, (DI)(BX*1) 1540 SETHI AX 1541 LEAQ -1(AX*2), AX // convert 1/0 to +1/-1 1542 MOVQ AX, (R9) 1543 RET 
1544 1545 // 0 through 16 bytes left, alen>=8, blen>=8 1546 _0through16: 1547 CMPQ R8, $8 1548 JBE _0through8 1549 MOVQ (SI), AX 1550 MOVQ (DI), CX 1551 CMPQ AX, CX 1552 JNE diff8 1553 _0through8: 1554 MOVQ -8(SI)(R8*1), AX 1555 MOVQ -8(DI)(R8*1), CX 1556 CMPQ AX, CX 1557 JEQ allsame 1558 1559 // AX and CX contain parts of a and b that differ. 1560 diff8: 1561 BSWAPQ AX // reverse order of bytes 1562 BSWAPQ CX 1563 XORQ AX, CX 1564 BSRQ CX, CX // index of highest bit difference 1565 SHRQ CX, AX // move a's bit to bottom 1566 ANDQ $1, AX // mask bit 1567 LEAQ -1(AX*2), AX // 1/0 => +1/-1 1568 MOVQ AX, (R9) 1569 RET 1570 1571 // 0-7 bytes in common 1572 small: 1573 LEAQ (R8*8), CX // bytes left -> bits left 1574 NEGQ CX // - bits lift (== 64 - bits left mod 64) 1575 JEQ allsame 1576 1577 // load bytes of a into high bytes of AX 1578 CMPB SI, $0xf8 1579 JA si_high 1580 MOVQ (SI), SI 1581 JMP si_finish 1582 si_high: 1583 MOVQ -8(SI)(R8*1), SI 1584 SHRQ CX, SI 1585 si_finish: 1586 SHLQ CX, SI 1587 1588 // load bytes of b in to high bytes of BX 1589 CMPB DI, $0xf8 1590 JA di_high 1591 MOVQ (DI), DI 1592 JMP di_finish 1593 di_high: 1594 MOVQ -8(DI)(R8*1), DI 1595 SHRQ CX, DI 1596 di_finish: 1597 SHLQ CX, DI 1598 1599 BSWAPQ SI // reverse order of bytes 1600 BSWAPQ DI 1601 XORQ SI, DI // find bit differences 1602 JEQ allsame 1603 BSRQ DI, CX // index of highest bit difference 1604 SHRQ CX, SI // move a's bit to bottom 1605 ANDQ $1, SI // mask bit 1606 LEAQ -1(SI*2), AX // 1/0 => +1/-1 1607 MOVQ AX, (R9) 1608 RET 1609 1610 allsame: 1611 XORQ AX, AX 1612 XORQ CX, CX 1613 CMPQ BX, DX 1614 SETGT AX // 1 if alen > blen 1615 SETEQ CX // 1 if alen == blen 1616 LEAQ -1(CX)(AX*2), AX // 1,0,-1 result 1617 MOVQ AX, (R9) 1618 RET 1619 1620 // this works for >= 64 bytes of data. 
1621 big_loop: 1622 MOVOU (SI), X0 1623 MOVOU (DI), X1 1624 PCMPEQB X0, X1 1625 PMOVMSKB X1, AX 1626 XORQ $0xffff, AX 1627 JNE diff16 1628 1629 MOVOU 16(SI), X0 1630 MOVOU 16(DI), X1 1631 PCMPEQB X0, X1 1632 PMOVMSKB X1, AX 1633 XORQ $0xffff, AX 1634 JNE diff32 1635 1636 MOVOU 32(SI), X0 1637 MOVOU 32(DI), X1 1638 PCMPEQB X0, X1 1639 PMOVMSKB X1, AX 1640 XORQ $0xffff, AX 1641 JNE diff48 1642 1643 MOVOU 48(SI), X0 1644 MOVOU 48(DI), X1 1645 PCMPEQB X0, X1 1646 PMOVMSKB X1, AX 1647 XORQ $0xffff, AX 1648 JNE diff64 1649 1650 ADDQ $64, SI 1651 ADDQ $64, DI 1652 SUBQ $64, R8 1653 CMPQ R8, $64 1654 JBE loop 1655 JMP big_loop 1656 1657 TEXT bytes·IndexByte(SB),NOSPLIT,$0-40 1658 MOVQ s+0(FP), SI 1659 MOVQ s_len+8(FP), BX 1660 MOVB c+24(FP), AL 1661 LEAQ ret+32(FP), R8 1662 JMP runtime·indexbytebody(SB) 1663 1664 TEXT strings·IndexByte(SB),NOSPLIT,$0-32 1665 MOVQ s+0(FP), SI 1666 MOVQ s_len+8(FP), BX 1667 MOVB c+16(FP), AL 1668 LEAQ ret+24(FP), R8 1669 JMP runtime·indexbytebody(SB) 1670 1671 // input: 1672 // SI: data 1673 // BX: data len 1674 // AL: byte sought 1675 // R8: address to put result 1676 TEXT runtime·indexbytebody(SB),NOSPLIT,$0 1677 MOVQ SI, DI 1678 1679 CMPQ BX, $16 1680 JLT small 1681 1682 // round up to first 16-byte boundary 1683 TESTQ $15, SI 1684 JZ aligned 1685 MOVQ SI, CX 1686 ANDQ $~15, CX 1687 ADDQ $16, CX 1688 1689 // search the beginning 1690 SUBQ SI, CX 1691 REPN; SCASB 1692 JZ success 1693 1694 // DI is 16-byte aligned; get ready to search using SSE instructions 1695 aligned: 1696 // round down to last 16-byte boundary 1697 MOVQ BX, R11 1698 ADDQ SI, R11 1699 ANDQ $~15, R11 1700 1701 // shuffle X0 around so that each byte contains c 1702 MOVD AX, X0 1703 PUNPCKLBW X0, X0 1704 PUNPCKLBW X0, X0 1705 PSHUFL $0, X0, X0 1706 JMP condition 1707 1708 sse: 1709 // move the next 16-byte chunk of the buffer into X1 1710 MOVO (DI), X1 1711 // compare bytes in X0 to X1 1712 PCMPEQB X0, X1 1713 // take the top bit of each byte in X1 and put the result in DX 
1714 PMOVMSKB X1, DX 1715 TESTL DX, DX 1716 JNZ ssesuccess 1717 ADDQ $16, DI 1718 1719 condition: 1720 CMPQ DI, R11 1721 JLT sse 1722 1723 // search the end 1724 MOVQ SI, CX 1725 ADDQ BX, CX 1726 SUBQ R11, CX 1727 // if CX == 0, the zero flag will be set and we'll end up 1728 // returning a false success 1729 JZ failure 1730 REPN; SCASB 1731 JZ success 1732 1733 failure: 1734 MOVQ $-1, (R8) 1735 RET 1736 1737 // handle for lengths < 16 1738 small: 1739 MOVQ BX, CX 1740 REPN; SCASB 1741 JZ success 1742 MOVQ $-1, (R8) 1743 RET 1744 1745 // we've found the chunk containing the byte 1746 // now just figure out which specific byte it is 1747 ssesuccess: 1748 // get the index of the least significant set bit 1749 BSFW DX, DX 1750 SUBQ SI, DI 1751 ADDQ DI, DX 1752 MOVQ DX, (R8) 1753 RET 1754 1755 success: 1756 SUBQ SI, DI 1757 SUBL $1, DI 1758 MOVQ DI, (R8) 1759 RET 1760 1761 TEXT bytes·Equal(SB),NOSPLIT,$0-49 1762 MOVQ a_len+8(FP), BX 1763 MOVQ b_len+32(FP), CX 1764 CMPQ BX, CX 1765 JNE eqret 1766 MOVQ a+0(FP), SI 1767 MOVQ b+24(FP), DI 1768 LEAQ ret+48(FP), AX 1769 JMP runtime·memeqbody(SB) 1770 eqret: 1771 MOVB $0, ret+48(FP) 1772 RET 1773 1774 TEXT runtime·fastrand1(SB), NOSPLIT, $0-4 1775 get_tls(CX) 1776 MOVQ g(CX), AX 1777 MOVQ g_m(AX), AX 1778 MOVL m_fastrand(AX), DX 1779 ADDL DX, DX 1780 MOVL DX, BX 1781 XORL $0x88888eef, DX 1782 CMOVLMI BX, DX 1783 MOVL DX, m_fastrand(AX) 1784 MOVL DX, ret+0(FP) 1785 RET 1786 1787 TEXT runtime·return0(SB), NOSPLIT, $0 1788 MOVL $0, AX 1789 RET 1790 1791 1792 // Called from cgo wrappers, this function returns g->m->curg.stack.hi. 1793 // Must obey the gcc calling convention. 1794 TEXT _cgo_topofstack(SB),NOSPLIT,$0 1795 get_tls(CX) 1796 MOVQ g(CX), AX 1797 MOVQ g_m(AX), AX 1798 MOVQ m_curg(AX), AX 1799 MOVQ (g_stack+stack_hi)(AX), AX 1800 RET 1801 1802 // The top-most function running on a goroutine 1803 // returns to goexit+PCQuantum. 
1804 TEXT runtime·goexit(SB),NOSPLIT,$0-0 1805 BYTE $0x90 // NOP 1806 CALL runtime·goexit1(SB) // does not return 1807 // traceback from goexit1 must hit code range of goexit 1808 BYTE $0x90 // NOP 1809 1810 TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8 1811 MOVQ addr+0(FP), AX 1812 PREFETCHT0 (AX) 1813 RET 1814 1815 TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8 1816 MOVQ addr+0(FP), AX 1817 PREFETCHT1 (AX) 1818 RET 1819 1820 TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8 1821 MOVQ addr+0(FP), AX 1822 PREFETCHT2 (AX) 1823 RET 1824 1825 TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8 1826 MOVQ addr+0(FP), AX 1827 PREFETCHNTA (AX) 1828 RET 1829 1830 // This is called from .init_array and follows the platform, not Go, ABI. 1831 TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0 1832 PUSHQ R15 // The access to global variables below implicitly uses R15, which is callee-save 1833 MOVQ runtime·lastmoduledatap(SB), AX 1834 MOVQ DI, moduledata_next(AX) 1835 MOVQ DI, runtime·lastmoduledatap(SB) 1836 POPQ R15 1837 RET