github.com/q45/go@v0.0.0-20151101211701-a4fb8c13db3f/src/runtime/asm_amd64.s

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "go_tls.h"
#include "funcdata.h"
#include "textflag.h"

TEXT runtime·rt0_go(SB),NOSPLIT,$0
	// copy arguments forward on an even stack
	MOVQ	DI, AX		// argc
	MOVQ	SI, BX		// argv
	SUBQ	$(4*8+7), SP	// 2args 2auto
	ANDQ	$~15, SP
	MOVQ	AX, 16(SP)
	MOVQ	BX, 24(SP)

	// create istack out of the given (operating system) stack.
	// _cgo_init may update stackguard.
	MOVQ	$runtime·g0(SB), DI
	LEAQ	(-64*1024+104)(SP), BX
	MOVQ	BX, g_stackguard0(DI)
	MOVQ	BX, g_stackguard1(DI)
	MOVQ	BX, (g_stack+stack_lo)(DI)
	MOVQ	SP, (g_stack+stack_hi)(DI)

	// find out information about the processor we're on
	MOVQ	$0, AX
	CPUID
	CMPQ	AX, $0
	JE	nocpuinfo

	// Figure out how to serialize RDTSC.
	// On Intel processors LFENCE is enough. AMD requires MFENCE.
	// Don't know about the rest, so let's do MFENCE.
	CMPL	BX, $0x756E6547	// "Genu"
	JNE	notintel
	CMPL	DX, $0x49656E69	// "ineI"
	JNE	notintel
	CMPL	CX, $0x6C65746E	// "ntel"
	JNE	notintel
	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
notintel:

	MOVQ	$1, AX
	CPUID
	MOVL	CX, runtime·cpuid_ecx(SB)
	MOVL	DX, runtime·cpuid_edx(SB)
nocpuinfo:

	// if there is an _cgo_init, call it.
	MOVQ	_cgo_init(SB), AX
	TESTQ	AX, AX
	JZ	needtls
	// g0 already in DI
	MOVQ	DI, CX	// Win64 uses CX for first parameter
	MOVQ	$setg_gcc<>(SB), SI
	CALL	AX

	// update stackguard after _cgo_init
	MOVQ	$runtime·g0(SB), CX
	MOVQ	(g_stack+stack_lo)(CX), AX
	ADDQ	$const__StackGuard, AX
	MOVQ	AX, g_stackguard0(CX)
	MOVQ	AX, g_stackguard1(CX)

#ifndef GOOS_windows
	JMP	ok
#endif
needtls:
#ifdef GOOS_plan9
	// skip TLS setup on Plan 9
	JMP	ok
#endif
#ifdef GOOS_solaris
	// skip TLS setup on Solaris
	JMP	ok
#endif

	LEAQ	runtime·tls0(SB), DI
	CALL	runtime·settls(SB)

	// store through it, to make sure it works
	get_tls(BX)
	MOVQ	$0x123, g(BX)
	MOVQ	runtime·tls0(SB), AX
	CMPQ	AX, $0x123
	JEQ	2(PC)
	MOVL	AX, 0	// abort
ok:
	// set the per-goroutine and per-mach "registers"
	get_tls(BX)
	LEAQ	runtime·g0(SB), CX
	MOVQ	CX, g(BX)
	LEAQ	runtime·m0(SB), AX

	// save m->g0 = g0
	MOVQ	CX, m_g0(AX)
	// save m0 to g0->m
	MOVQ	AX, g_m(CX)

	CLD	// convention is D is always left cleared
	CALL	runtime·check(SB)

	MOVL	16(SP), AX	// copy argc
	MOVL	AX, 0(SP)
	MOVQ	24(SP), AX	// copy argv
	MOVQ	AX, 8(SP)
	CALL	runtime·args(SB)
	CALL	runtime·osinit(SB)
	CALL	runtime·schedinit(SB)

	// create a new goroutine to start program
	MOVQ	$runtime·mainPC(SB), AX	// entry
	PUSHQ	AX
	PUSHQ	$0	// arg size
	CALL	runtime·newproc(SB)
	POPQ	AX
	POPQ	AX

	// start this M
	CALL	runtime·mstart(SB)

	MOVL	$0xf1, 0xf1	// crash
	RET

DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
GLOBL	runtime·mainPC(SB),RODATA,$8

TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
	BYTE	$0xcc
	RET

TEXT runtime·asminit(SB),NOSPLIT,$0-0
	// No per-thread init.
	RET

/*
 *  go-routine
 */

// void gosave(Gobuf*)
// save state in Gobuf; setjmp
TEXT runtime·gosave(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), AX	// gobuf
	LEAQ	buf+0(FP), BX	// caller's SP
	MOVQ	BX, gobuf_sp(AX)
	MOVQ	0(SP), BX	// caller's PC
	MOVQ	BX, gobuf_pc(AX)
	MOVQ	$0, gobuf_ret(AX)
	MOVQ	$0, gobuf_ctxt(AX)
	MOVQ	BP, gobuf_bp(AX)
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	BX, gobuf_g(AX)
	RET

// void gogo(Gobuf*)
// restore state from Gobuf; longjmp
TEXT runtime·gogo(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), BX	// gobuf
	MOVQ	gobuf_g(BX), DX
	MOVQ	0(DX), CX	// make sure g != nil
	get_tls(CX)
	MOVQ	DX, g(CX)
	MOVQ	gobuf_sp(BX), SP	// restore SP
	MOVQ	gobuf_ret(BX), AX
	MOVQ	gobuf_ctxt(BX), DX
	MOVQ	gobuf_bp(BX), BP
	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
	MOVQ	$0, gobuf_ret(BX)
	MOVQ	$0, gobuf_ctxt(BX)
	MOVQ	$0, gobuf_bp(BX)
	MOVQ	gobuf_pc(BX), BX
	JMP	BX

// func mcall(fn func(*g))
// Switch to m->g0's stack, call fn(g).
// Fn must never return. It should gogo(&g->sched)
// to keep running g.
TEXT runtime·mcall(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI

	get_tls(CX)
	MOVQ	g(CX), AX	// save state in g->sched
	MOVQ	0(SP), BX	// caller's PC
	MOVQ	BX, (g_sched+gobuf_pc)(AX)
	LEAQ	fn+0(FP), BX	// caller's SP
	MOVQ	BX, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to m->g0 & its stack, call fn
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	SI, AX	// if g == m->g0 call badmcall
	JNE	3(PC)
	MOVQ	$runtime·badmcall(SB), AX
	JMP	AX
	MOVQ	SI, g(CX)	// g = m->g0
	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	PUSHQ	AX
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI
	POPQ	AX
	MOVQ	$runtime·badmcall2(SB), AX
	JMP	AX
	RET

// systemstack_switch is a dummy routine that systemstack leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the system stack because the one at the top of
// the system stack terminates the stack walk (see topofstack()).
TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
	RET

// func systemstack(fn func())
TEXT runtime·systemstack(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI	// DI = fn
	get_tls(CX)
	MOVQ	g(CX), AX	// AX = g
	MOVQ	g_m(AX), BX	// BX = m

	MOVQ	m_gsignal(BX), DX	// DX = gsignal
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_g0(BX), DX	// DX = g0
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_curg(BX), R8
	CMPQ	AX, R8
	JEQ	switch

	// Bad: g is not gsignal, not g0, not curg. What is it?
	MOVQ	$runtime·badsystemstack(SB), AX
	CALL	AX

switch:
	// save our state in g->sched. Pretend to
	// be systemstack_switch if the G stack is scanned.
	MOVQ	$runtime·systemstack_switch(SB), SI
	MOVQ	SI, (g_sched+gobuf_pc)(AX)
	MOVQ	SP, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to g0
	MOVQ	DX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(DX), BX
	// make it look like mstart called systemstack on g0, to stop traceback
	SUBQ	$8, BX
	MOVQ	$runtime·mstart(SB), DX
	MOVQ	DX, 0(BX)
	MOVQ	BX, SP

	// call target function
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI

	// switch back to g
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), BX
	MOVQ	m_curg(BX), AX
	MOVQ	AX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(AX), SP
	MOVQ	$0, (g_sched+gobuf_sp)(AX)
	RET

noswitch:
	// already on m stack, just call directly
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI
	RET

/*
 * support for morestack
 */

// Called during function prolog when more stack is needed.
//
// The traceback routines see morestack on a g0 as being
// the top of a stack (for example, morestack calling newstack
// calling the scheduler calling newm calling gc), so we must
// record an argument size. For that purpose, it has no arguments.
TEXT runtime·morestack(SB),NOSPLIT,$0-0
	// Cannot grow scheduler stack (m->g0).
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	g(CX), SI
	JNE	2(PC)
	INT	$3

	// Cannot grow signal stack (m->gsignal).
	MOVQ	m_gsignal(BX), SI
	CMPQ	g(CX), SI
	JNE	2(PC)
	INT	$3

	// Called from f.
	// Set m->morebuf to f's caller.
	MOVQ	8(SP), AX	// f's caller's PC
	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
	LEAQ	16(SP), AX	// f's caller's SP
	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SI, (m_morebuf+gobuf_g)(BX)

	// Set g->sched to context in f.
	MOVQ	0(SP), AX	// f's PC
	MOVQ	AX, (g_sched+gobuf_pc)(SI)
	MOVQ	SI, (g_sched+gobuf_g)(SI)
	LEAQ	8(SP), AX	// f's SP
	MOVQ	AX, (g_sched+gobuf_sp)(SI)
	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
	MOVQ	BP, (g_sched+gobuf_bp)(SI)

	// Call newstack on m->g0's stack.
	MOVQ	m_g0(BX), BX
	MOVQ	BX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(BX), SP
	CALL	runtime·newstack(SB)
	MOVQ	$0, 0x1003	// crash if newstack returns
	RET

// morestack but not preserving ctxt.
TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
	MOVL	$0, DX
	JMP	runtime·morestack(SB)

TEXT runtime·stackBarrier(SB),NOSPLIT,$0
	// We came here via a RET to an overwritten return PC.
	// AX may be live. Other registers are available.

	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
	get_tls(CX)
	MOVQ	g(CX), CX
	MOVQ	(g_stkbar+slice_array)(CX), DX
	MOVQ	g_stkbarPos(CX), BX
	IMULQ	$stkbar__size, BX	// Too big for SIB.
	MOVQ	stkbar_savedLRPtr(DX)(BX*1), R8
	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
	// Assert that we're popping the right saved LR.
	CMPQ	R8, SP
	JNE	2(PC)
	MOVL	$0, 0
	// Record that this stack barrier was hit.
	ADDQ	$1, g_stkbarPos(CX)
	// Jump to the original return PC.
	JMP	BX

// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
// of constant-sized-frame functions to encode a few bits of size in the pc.
// Caution: ugly multiline assembly macros in your future!
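//
// An illustrative Go-level sketch of the dispatch below (editorial addition;
// the real declarations live in the Go sources, not in this file):
//
//	// reflectcall picks the smallest fixed-size frame that fits argsize:
//	switch {
//	case argsize <= 32:
//		call32(argtype, f, arg, argsize, retoffset)
//	case argsize <= 64:
//		call64(argtype, f, arg, argsize, retoffset)
//	// ...and so on, doubling up to call1073741824...
//	default:
//		badreflectcall()
//	}
//
// Each callNNN frame (see CALLFN) copies argsize bytes of arguments onto its
// own stack, CALLs f, copies the bytes from retoffset onward back into the
// caller's argument block, and then calls callwritebarrier so write barriers
// run for any pointers copied back.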

#define DISPATCH(NAME,MAXSIZE)		\
	CMPQ	CX, $MAXSIZE;		\
	JA	3(PC);			\
	MOVQ	$NAME(SB), AX;		\
	JMP	AX
// Note: can't just "JMP NAME(SB)" - bad inlining results.

TEXT reflect·call(SB), NOSPLIT, $0-0
	JMP	·reflectcall(SB)

TEXT ·reflectcall(SB), NOSPLIT, $0-32
	MOVLQZX argsize+24(FP), CX
	// NOTE(rsc): No call16, because CALLFN needs four words
	// of argument space to invoke callwritebarrier.
	DISPATCH(runtime·call32, 32)
	DISPATCH(runtime·call64, 64)
	DISPATCH(runtime·call128, 128)
	DISPATCH(runtime·call256, 256)
	DISPATCH(runtime·call512, 512)
	DISPATCH(runtime·call1024, 1024)
	DISPATCH(runtime·call2048, 2048)
	DISPATCH(runtime·call4096, 4096)
	DISPATCH(runtime·call8192, 8192)
	DISPATCH(runtime·call16384, 16384)
	DISPATCH(runtime·call32768, 32768)
	DISPATCH(runtime·call65536, 65536)
	DISPATCH(runtime·call131072, 131072)
	DISPATCH(runtime·call262144, 262144)
	DISPATCH(runtime·call524288, 524288)
	DISPATCH(runtime·call1048576, 1048576)
	DISPATCH(runtime·call2097152, 2097152)
	DISPATCH(runtime·call4194304, 4194304)
	DISPATCH(runtime·call8388608, 8388608)
	DISPATCH(runtime·call16777216, 16777216)
	DISPATCH(runtime·call33554432, 33554432)
	DISPATCH(runtime·call67108864, 67108864)
	DISPATCH(runtime·call134217728, 134217728)
	DISPATCH(runtime·call268435456, 268435456)
	DISPATCH(runtime·call536870912, 536870912)
	DISPATCH(runtime·call1073741824, 1073741824)
	MOVQ	$runtime·badreflectcall(SB), AX
	JMP	AX

#define CALLFN(NAME,MAXSIZE)			\
TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
	NO_LOCAL_POINTERS;			\
	/* copy arguments to stack */		\
	MOVQ	argptr+16(FP), SI;		\
	MOVLQZX argsize+24(FP), CX;		\
	MOVQ	SP, DI;				\
	REP;MOVSB;				\
	/* call function */			\
	MOVQ	f+8(FP), DX;			\
	PCDATA	$PCDATA_StackMapIndex, $0;	\
	CALL	(DX);				\
	/* copy return values back */		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX argsize+24(FP), CX;		\
	MOVLQZX retoffset+28(FP), BX;		\
	MOVQ	SP, SI;				\
	ADDQ	BX, DI;				\
	ADDQ	BX, SI;				\
	SUBQ	BX, CX;				\
	REP;MOVSB;				\
	/* execute write barrier updates */	\
	MOVQ	argtype+0(FP), DX;		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX argsize+24(FP), CX;		\
	MOVLQZX retoffset+28(FP), BX;		\
	MOVQ	DX, 0(SP);			\
	MOVQ	DI, 8(SP);			\
	MOVQ	CX, 16(SP);			\
	MOVQ	BX, 24(SP);			\
	CALL	runtime·callwritebarrier(SB);	\
	RET

CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
CALLFN(·call256, 256)
CALLFN(·call512, 512)
CALLFN(·call1024, 1024)
CALLFN(·call2048, 2048)
CALLFN(·call4096, 4096)
CALLFN(·call8192, 8192)
CALLFN(·call16384, 16384)
CALLFN(·call32768, 32768)
CALLFN(·call65536, 65536)
CALLFN(·call131072, 131072)
CALLFN(·call262144, 262144)
CALLFN(·call524288, 524288)
CALLFN(·call1048576, 1048576)
CALLFN(·call2097152, 2097152)
CALLFN(·call4194304, 4194304)
CALLFN(·call8388608, 8388608)
CALLFN(·call16777216, 16777216)
CALLFN(·call33554432, 33554432)
CALLFN(·call67108864, 67108864)
CALLFN(·call134217728, 134217728)
CALLFN(·call268435456, 268435456)
CALLFN(·call536870912, 536870912)
CALLFN(·call1073741824, 1073741824)

// bool cas(int32 *val, int32 old, int32 new)
// Atomically:
//	if(*val == old){
//		*val = new;
//		return 1;
//	} else
//		return 0;
TEXT runtime·cas(SB), NOSPLIT, $0-17
	MOVQ	ptr+0(FP), BX
	MOVL	old+8(FP), AX
	MOVL	new+12(FP), CX
	LOCK
	CMPXCHGL	CX, 0(BX)
	SETEQ	ret+16(FP)
	RET

// bool	runtime·cas64(uint64 *val, uint64 old, uint64 new)
// Atomically:
//	if(*val == old){
//		*val = new;
//		return 1;
//	} else {
//		return 0;
//	}
TEXT runtime·cas64(SB), NOSPLIT, $0-25
	MOVQ	ptr+0(FP), BX
	MOVQ	old+8(FP), AX
	MOVQ	new+16(FP), CX
	LOCK
	CMPXCHGQ	CX, 0(BX)
	SETEQ	ret+24(FP)
	RET

TEXT runtime·casuintptr(SB), NOSPLIT, $0-25
	JMP	runtime·cas64(SB)

TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-16
	JMP	runtime·atomicload64(SB)

TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-16
	JMP	runtime·atomicload64(SB)

TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16
	JMP	runtime·atomicstore64(SB)

// bool casp(void **val, void *old, void *new)
// Atomically:
//	if(*val == old){
//		*val = new;
//		return 1;
//	} else
//		return 0;
TEXT runtime·casp1(SB), NOSPLIT, $0-25
	MOVQ	ptr+0(FP), BX
	MOVQ	old+8(FP), AX
	MOVQ	new+16(FP), CX
	LOCK
	CMPXCHGQ	CX, 0(BX)
	SETEQ	ret+24(FP)
	RET

// uint32 xadd(uint32 volatile *val, int32 delta)
// Atomically:
//	*val += delta;
//	return *val;
TEXT runtime·xadd(SB), NOSPLIT, $0-20
	MOVQ	ptr+0(FP), BX
	MOVL	delta+8(FP), AX
	MOVL	AX, CX
	LOCK
	XADDL	AX, 0(BX)
	ADDL	CX, AX
	MOVL	AX, ret+16(FP)
	RET

TEXT runtime·xadd64(SB), NOSPLIT, $0-24
	MOVQ	ptr+0(FP), BX
	MOVQ	delta+8(FP), AX
	MOVQ	AX, CX
	LOCK
	XADDQ	AX, 0(BX)
	ADDQ	CX, AX
	MOVQ	AX, ret+16(FP)
	RET

TEXT runtime·xadduintptr(SB), NOSPLIT, $0-24
	JMP	runtime·xadd64(SB)

TEXT runtime·xchg(SB), NOSPLIT, $0-20
	MOVQ	ptr+0(FP), BX
	MOVL	new+8(FP), AX
	XCHGL	AX, 0(BX)
	MOVL	AX, ret+16(FP)
	RET

TEXT runtime·xchg64(SB), NOSPLIT, $0-24
	MOVQ	ptr+0(FP), BX
	MOVQ	new+8(FP), AX
	XCHGQ	AX, 0(BX)
	MOVQ	AX, ret+16(FP)
	RET

TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24
	JMP	runtime·xchg64(SB)

TEXT runtime·procyield(SB),NOSPLIT,$0-0
	MOVL	cycles+0(FP), AX
again:
	PAUSE
	SUBL	$1, AX
	JNZ	again
	RET

TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16
	MOVQ	ptr+0(FP), BX
	MOVQ	val+8(FP), AX
	XCHGQ	AX, 0(BX)
	RET

TEXT runtime·atomicstore(SB), NOSPLIT, $0-12
	MOVQ	ptr+0(FP), BX
	MOVL	val+8(FP), AX
	XCHGL	AX, 0(BX)
	RET

TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16
	MOVQ	ptr+0(FP), BX
	MOVQ	val+8(FP), AX
	XCHGQ	AX, 0(BX)
	RET

// void	runtime·atomicor8(byte volatile*, byte);
TEXT runtime·atomicor8(SB), NOSPLIT, $0-9
	MOVQ	ptr+0(FP), AX
	MOVB	val+8(FP), BX
	LOCK
	ORB	BX, (AX)
	RET

// void	runtime·atomicand8(byte volatile*, byte);
TEXT runtime·atomicand8(SB), NOSPLIT, $0-9
	MOVQ	ptr+0(FP), AX
	MOVB	val+8(FP), BX
	LOCK
	ANDB	BX, (AX)
	RET

TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
	// Stores are already ordered on x86, so this is just a
	// compile barrier.
	RET

// void jmpdefer(fn, sp);
// called from deferreturn.
// 1. pop the caller
// 2. sub 5 bytes from the caller's return
// 3. jmp to the argument
TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
	MOVQ	fv+0(FP), DX	// fn
	MOVQ	argp+8(FP), BX	// caller sp
	LEAQ	-8(BX), SP	// caller sp after CALL
	SUBQ	$5, (SP)	// return to CALL again
	MOVQ	0(DX), BX
	JMP	BX	// but first run the deferred function

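// Editorial note on the SUBQ $5 above (added explanation, not an original
// comment): the saved return address points just past the 5-byte CALL
// instruction that invoked deferreturn, so rewinding it by 5 makes the
// deferred function's eventual RET land back on that CALL. deferreturn then
// runs again and picks up the next deferred call, until none remain.
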
// Save state of caller into g->sched. Smashes R8, R9.
TEXT gosave<>(SB),NOSPLIT,$0
	get_tls(R8)
	MOVQ	g(R8), R8
	MOVQ	0(SP), R9
	MOVQ	R9, (g_sched+gobuf_pc)(R8)
	LEAQ	8(SP), R9
	MOVQ	R9, (g_sched+gobuf_sp)(R8)
	MOVQ	$0, (g_sched+gobuf_ret)(R8)
	MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
	MOVQ	BP, (g_sched+gobuf_bp)(R8)
	RET

// func asmcgocall(fn, arg unsafe.Pointer) int32
// Call fn(arg) on the scheduler stack,
// aligned appropriately for the gcc ABI.
// See cgocall.go for more details.
TEXT ·asmcgocall(SB),NOSPLIT,$0-20
	MOVQ	fn+0(FP), AX
	MOVQ	arg+8(FP), BX

	MOVQ	SP, DX

	// Figure out if we need to switch to m->g0 stack.
	// We get called to create new OS threads too, and those
	// come in on the m->g0 stack already.
	get_tls(CX)
	MOVQ	g(CX), R8
	MOVQ	g_m(R8), R8
	MOVQ	m_g0(R8), SI
	MOVQ	g(CX), DI
	CMPQ	SI, DI
	JEQ	nosave
	MOVQ	m_gsignal(R8), SI
	CMPQ	SI, DI
	JEQ	nosave

	MOVQ	m_g0(R8), SI
	CALL	gosave<>(SB)
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP
nosave:

	// Now on a scheduling stack (a pthread-created stack).
	// Make sure we have enough room for 4 stack-backed fast-call
	// registers as per windows amd64 calling convention.
	SUBQ	$64, SP
	ANDQ	$~15, SP	// alignment for gcc ABI
	MOVQ	DI, 48(SP)	// save g
	MOVQ	(g_stack+stack_hi)(DI), DI
	SUBQ	DX, DI
	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
	MOVQ	BX, DI	// DI = first argument in AMD64 ABI
	MOVQ	BX, CX	// CX = first argument in Win64
	CALL	AX

	// Restore registers, g, stack pointer.
	get_tls(CX)
	MOVQ	48(SP), DI
	MOVQ	(g_stack+stack_hi)(DI), SI
	SUBQ	40(SP), SI
	MOVQ	DI, g(CX)
	MOVQ	SI, SP

	MOVL	AX, ret+16(FP)
	RET

// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
// Turn the fn into a Go func (by taking its address) and call
// cgocallback_gofunc.
TEXT runtime·cgocallback(SB),NOSPLIT,$24-24
	LEAQ	fn+0(FP), AX
	MOVQ	AX, 0(SP)
	MOVQ	frame+8(FP), AX
	MOVQ	AX, 8(SP)
	MOVQ	framesize+16(FP), AX
	MOVQ	AX, 16(SP)
	MOVQ	$runtime·cgocallback_gofunc(SB), AX
	CALL	AX
	RET

// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
// See cgocall.go for more details.
TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-24
	NO_LOCAL_POINTERS

	// If g is nil, Go did not create the current thread.
	// Call needm to obtain one m for temporary use.
	// In this case, we're running on the thread stack, so there's
	// lots of space, but the linker doesn't know. Hide the call from
	// the linker analysis by using an indirect call through AX.
	get_tls(CX)
#ifdef GOOS_windows
	MOVL	$0, BX
	CMPQ	CX, $0
	JEQ	2(PC)
#endif
	MOVQ	g(CX), BX
	CMPQ	BX, $0
	JEQ	needm
	MOVQ	g_m(BX), BX
	MOVQ	BX, R8	// holds oldm until end of function
	JMP	havem
needm:
	MOVQ	$0, 0(SP)
	MOVQ	$runtime·needm(SB), AX
	CALL	AX
	MOVQ	0(SP), R8
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX

	// Set m->sched.sp = SP, so that if a panic happens
	// during the function we are about to execute, it will
	// have a valid SP to run on the g0 stack.
	// The next few lines (after the havem label)
	// will save this SP onto the stack and then write
	// the same SP back to m->sched.sp. That seems redundant,
	// but if an unrecovered panic happens, unwindm will
	// restore the g->sched.sp from the stack location
	// and then systemstack will try to use it. If we don't set it here,
	// that restored SP will be uninitialized (typically 0) and
	// will not be usable.
	MOVQ	m_g0(BX), SI
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

havem:
	// Now there's a valid m, and we're running on its m->g0.
	// Save current m->g0->sched.sp on stack and then set it to SP.
	// Save current sp in m->g0->sched.sp in preparation for
	// switch back to m->curg stack.
	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
	MOVQ	m_g0(BX), SI
	MOVQ	(g_sched+gobuf_sp)(SI), AX
	MOVQ	AX, 0(SP)
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

	// Switch to m->curg stack and call runtime.cgocallbackg.
	// Because we are taking over the execution of m->curg
	// but *not* resuming what had been running, we need to
	// save that information (m->curg->sched) so we can restore it.
	// We can restore m->curg->sched.sp easily, because calling
	// runtime.cgocallbackg leaves SP unchanged upon return.
	// To save m->curg->sched.pc, we push it onto the stack.
	// This has the added benefit that it looks to the traceback
	// routine like cgocallbackg is going to return to that
	// PC (because the frame we allocate below has the same
	// size as cgocallback_gofunc's frame declared above)
	// so that the traceback will seamlessly trace back into
	// the earlier calls.
	//
	// In the new goroutine, 0(SP) holds the saved R8.
	MOVQ	m_curg(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), DI	// prepare stack as DI
	MOVQ	(g_sched+gobuf_pc)(SI), BX
	MOVQ	BX, -8(DI)
	// Compute the size of the frame, including return PC and, if
	// GOEXPERIMENT=framepointer, the saved base pointer
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX
	SUBQ	AX, DI
	MOVQ	DI, SP

	MOVQ	R8, 0(SP)
	CALL	runtime·cgocallbackg(SB)
	MOVQ	0(SP), R8

	// Compute the size of the frame again. FP and SP have
	// completely different values here than they did above,
	// but only their difference matters.
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX

	// Restore g->sched (== m->curg->sched) from saved values.
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SP, DI
	ADDQ	AX, DI
	MOVQ	-8(DI), BX
	MOVQ	BX, (g_sched+gobuf_pc)(SI)
	MOVQ	DI, (g_sched+gobuf_sp)(SI)

	// Switch back to m->g0's stack and restore m->g0->sched.sp.
	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
	// so we do not have to restore it.)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP
	MOVQ	0(SP), AX
	MOVQ	AX, (g_sched+gobuf_sp)(SI)

	// If the m on entry was nil, we called needm above to borrow an m
	// for the duration of the call. Since the call is over, return it with dropm.
	CMPQ	R8, $0
	JNE	3(PC)
	MOVQ	$runtime·dropm(SB), AX
	CALL	AX

	// Done!
	RET

// void setg(G*); set g. for use by needm.
TEXT runtime·setg(SB), NOSPLIT, $0-8
	MOVQ	gg+0(FP), BX
#ifdef GOOS_windows
	CMPQ	BX, $0
	JNE	settls
	MOVQ	$0, 0x28(GS)
	RET
settls:
	MOVQ	g_m(BX), AX
	LEAQ	m_tls(AX), AX
	MOVQ	AX, 0x28(GS)
#endif
	get_tls(CX)
	MOVQ	BX, g(CX)
	RET

// void setg_gcc(G*); set g called from gcc.
TEXT setg_gcc<>(SB),NOSPLIT,$0
	get_tls(AX)
	MOVQ	DI, g(AX)
	RET

// check that SP is in range [g->stack.lo, g->stack.hi)
TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
	get_tls(CX)
	MOVQ	g(CX), AX
	CMPQ	(g_stack+stack_hi)(AX), SP
	JHI	2(PC)
	INT	$3
	CMPQ	SP, (g_stack+stack_lo)(AX)
	JHI	2(PC)
	INT	$3
	RET

TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
	MOVQ	argp+0(FP),AX	// addr of first arg
	MOVQ	-8(AX),AX	// get calling pc
	CMPQ	AX, runtime·stackBarrierPC(SB)
	JNE	nobar
	// Get original return PC.
	CALL	runtime·nextBarrierPC(SB)
	MOVQ	0(SP), AX
nobar:
	MOVQ	AX, ret+8(FP)
	RET

TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
	MOVQ	argp+0(FP),AX	// addr of first arg
	MOVQ	pc+8(FP), BX
	MOVQ	-8(AX), CX
	CMPQ	CX, runtime·stackBarrierPC(SB)
	JEQ	setbar
	MOVQ	BX, -8(AX)	// set calling pc
	RET
setbar:
	// Set the stack barrier return PC.
	MOVQ	BX, 0(SP)
	CALL	runtime·setNextBarrierPC(SB)
	RET

TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
	MOVQ	argp+0(FP), AX
	MOVQ	AX, ret+8(FP)
	RET

// func cputicks() int64
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
	JNE	mfence
	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8	// LFENCE
	JMP	done
mfence:
	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0	// MFENCE
done:
	RDTSC
	SHLQ	$32, DX
	ADDQ	DX, AX
	MOVQ	AX, ret+0(FP)
	RET

// memhash_varlen(p unsafe.Pointer, h seed) uintptr
// redirects to memhash(p, h, size) using the size
// stored in the closure.
TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
	GO_ARGS
	NO_LOCAL_POINTERS
	MOVQ	p+0(FP), AX
	MOVQ	h+8(FP), BX
	MOVQ	8(DX), CX
	MOVQ	AX, 0(SP)
	MOVQ	BX, 8(SP)
	MOVQ	CX, 16(SP)
	CALL	runtime·memhash(SB)
	MOVQ	24(SP), AX
	MOVQ	AX, ret+16(FP)
	RET

// hash function using AES hardware instructions
TEXT runtime·aeshash(SB),NOSPLIT,$0-32
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	s+16(FP), CX	// size
	LEAQ	ret+24(FP), DX
	JMP	runtime·aeshashbody(SB)

TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to string struct
	MOVQ	8(AX), CX	// length of string
	MOVQ	(AX), AX	// string data
	LEAQ	ret+16(FP), DX
	JMP	runtime·aeshashbody(SB)

// AX: data
// CX: length
// DX: address to put return value
TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
	// Fill an SSE register with our seeds.
	MOVQ	h+8(FP), X0	// 64 bits of per-table hash seed
	PINSRW	$4, CX, X0	// 16 bits of length
	PSHUFHW	$0, X0, X0	// repeat length 4 times total
	MOVO	X0, X1	// save unscrambled seed
	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
	AESENC	X0, X0	// scramble seed

	CMPQ	CX, $16
	JB	aes0to15
	JE	aes16
	CMPQ	CX, $32
	JBE	aes17to32
	CMPQ	CX, $64
	JBE	aes33to64
	CMPQ	CX, $128
	JBE	aes65to128
	JMP	aes129plus

aes0to15:
	TESTQ	CX, CX
	JE	aes0

	ADDQ	$16, AX
	TESTW	$0xff0, AX
	JE	endofpage

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X1
	ADDQ	CX, CX
	MOVQ	$masks<>(SB), AX
	PAND	(AX)(CX*8), X1
final1:
	AESENC	X0, X1	// scramble input, xor in seed
	AESENC	X1, X1	// scramble combo 2 times
	AESENC	X1, X1
	MOVQ	X1, (DX)
	RET

endofpage:
	// address ends in 1111xxxx. Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	MOVOU	-32(AX)(CX*1), X1
	ADDQ	CX, CX
	MOVQ	$shifts<>(SB), AX
	PSHUFB	(AX)(CX*8), X1
	JMP	final1

aes0:
	// Return scrambled input seed
	AESENC	X0, X0
	MOVQ	X0, (DX)
	RET

aes16:
	MOVOU	(AX), X1
	JMP	final1

aes17to32:
	// make second starting seed
	PXOR	runtime·aeskeysched+16(SB), X1
	AESENC	X1, X1

	// load data to be hashed
	MOVOU	(AX), X2
	MOVOU	-16(AX)(CX*1), X3

	// scramble 3 times
	AESENC	X0, X2
	AESENC	X1, X3
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3

	// combine results
	PXOR	X3, X2
	MOVQ	X2, (DX)
	RET

aes33to64:
	// make 3 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3

	MOVOU	(AX), X4
	MOVOU	16(AX), X5
	MOVOU	-32(AX)(CX*1), X6
	MOVOU	-16(AX)(CX*1), X7

	AESENC	X0, X4
	AESENC	X1, X5
	AESENC	X2, X6
	AESENC	X3, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	PXOR	X6, X4
	PXOR	X7, X5
	PXOR	X5, X4
	MOVQ	X4, (DX)
	RET

aes65to128:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// load data
	MOVOU	(AX), X8
	MOVOU	16(AX), X9
	MOVOU	32(AX), X10
	MOVOU	48(AX), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// scramble data, xor in seed
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	// scramble twice
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

aes129plus:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// start with last (possibly overlapping) block
	MOVOU	-128(AX)(CX*1), X8
	MOVOU	-112(AX)(CX*1), X9
	MOVOU	-96(AX)(CX*1), X10
	MOVOU	-80(AX)(CX*1), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// scramble input once, xor in seed
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	// compute number of remaining 128-byte blocks
	DECQ	CX
	SHRQ	$7, CX

aesloop:
	// scramble state, xor in a block
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	MOVOU	64(AX), X4
	MOVOU	80(AX), X5
	MOVOU	96(AX), X6
	MOVOU	112(AX), X7
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	// scramble state
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	ADDQ	$128, AX
	DECQ	CX
	JNE	aesloop

	// 2 more scrambles to finish
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

// simple mask to get rid of data in the high part of the register.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256

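// Editorial example (added commentary): both tables hold sixteen 16-byte rows
// indexed by the data length n; the code doubles CX and scales by 8, so row n
// starts at byte offset n*16. In masks<> above, row n has 0xff in its low n
// bytes and zeroes above, e.g. row 3 is ff ff ff 00 ... 00, so PAND keeps just
// the n valid bytes. In shifts<> below, row n is a PSHUFB control that moves
// the last n bytes of the register down to the bottom and zeroes the rest
// (0xff entries select zero), e.g. row 3 is 0d 0e 0f ff ... ff.
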
// these are arguments to pshufb. They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
DATA shifts<>+0x00(SB)/8, $0x0000000000000000
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256

TEXT runtime·memeq(SB),NOSPLIT,$0-25
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	MOVQ	size+16(FP), BX
	LEAQ	ret+24(FP), AX
	JMP	runtime·memeqbody(SB)

// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	8(DX), BX	// compiler stores size at offset 8 in the closure
	LEAQ	ret+16(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+16(FP)
	RET

// eqstring tests whether two strings are equal.
// The compiler guarantees that strings passed
// to eqstring have equal length.
// See runtime_test.go:eqstring_generic for
// equivalent Go code.
TEXT runtime·eqstring(SB),NOSPLIT,$0-33
	MOVQ	s1str+0(FP), SI
	MOVQ	s2str+16(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	s1len+8(FP), BX
	LEAQ	v+32(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, v+32(FP)
	RET

// a in SI
// b in DI
// count in BX
// address of result byte in AX
TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff
	JEQ	hugeloop
	MOVB	$0, (AX)
	RET

	// 8 bytes at a time using 64-bit register
bigloop:
	CMPQ	BX, $8
	JBE	leftover
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	bigloop
	MOVB	$0, (AX)
	RET

	// remaining 0-8 bytes
leftover:
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	(AX)
	RET

small:
	CMPQ	BX, $0
	JEQ	equal

	LEAQ	0(BX*8), CX
	NEGQ	CX

	CMPB	SI, $0xf8
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
di_finish:

	SUBQ	SI, DI
	SHLQ	CX, DI
equal:
	SETEQ	(AX)
	RET

TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
	MOVQ	s1_base+0(FP), SI
	MOVQ	s1_len+8(FP), BX
	MOVQ	s2_base+16(FP), DI
	MOVQ	s2_len+24(FP), DX
	LEAQ	ret+32(FP), R9
	JMP	runtime·cmpbody(SB)

TEXT bytes·Compare(SB),NOSPLIT,$0-56
	MOVQ	s1+0(FP), SI
	MOVQ	s1+8(FP), BX
	MOVQ	s2+24(FP), DI
	MOVQ	s2+32(FP), DX
	LEAQ	res+48(FP), R9
	JMP	runtime·cmpbody(SB)

// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8	// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JA	big_loop
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX	// - bits left (== 64 - bits left mod 64)
	JEQ	allsame

	// load bytes of a into high bytes of SI
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI

	// load bytes of b into high bytes of DI
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET

	// this works for >= 64 bytes of data.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop

TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
	MOVQ	s+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+24(FP), AL
	LEAQ	ret+32(FP), R8
	JMP	runtime·indexbytebody(SB)

TEXT strings·IndexByte(SB),NOSPLIT,$0-32
	MOVQ	s+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+16(FP), AL
	LEAQ	ret+24(FP), R8
	JMP	runtime·indexbytebody(SB)

// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
	MOVQ	SI, DI

	CMPQ	BX, $16
	JLT	small

	// round up to first 16-byte boundary
	TESTQ	$15, SI
	JZ	aligned
	MOVQ	SI, CX
	ANDQ	$~15, CX
	ADDQ	$16, CX

	// search the beginning
	SUBQ	SI, CX
	REPN; SCASB
	JZ	success

// DI is 16-byte aligned; get ready to search using SSE instructions
aligned:
	// round down to last 16-byte boundary
	MOVQ	BX, R11
	ADDQ	SI, R11
	ANDQ	$~15, R11

	// shuffle X0 around so that each byte contains c
	MOVD	AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL	$0, X0, X0
	JMP	condition

sse:
	// move the next 16-byte chunk of the buffer into X1
	MOVO	(DI), X1
	// compare bytes in X0 to X1
	PCMPEQB	X0, X1
	// take the top bit of each byte in X1 and put the result in DX
	PMOVMSKB X1, DX
	TESTL	DX, DX
	JNZ	ssesuccess
	ADDQ	$16, DI

condition:
	CMPQ	DI, R11
	JLT	sse

	// search the end
	MOVQ	SI, CX
	ADDQ	BX, CX
	SUBQ	R11, CX
	// if CX == 0, the zero flag will be set and we'll end up
	// returning a false success
	JZ	failure
	REPN; SCASB
	JZ	success

failure:
	MOVQ	$-1, (R8)
	RET

// handle lengths < 16
small:
	MOVQ	BX, CX
	REPN; SCASB
	JZ	success
	MOVQ	$-1, (R8)
	RET

// we've found the chunk containing the byte
// now just figure out which specific byte it is
ssesuccess:
	// get the index of the least significant set bit
	BSFW	DX, DX
	SUBQ	SI, DI
	ADDQ	DI, DX
	MOVQ	DX, (R8)
	RET

success:
	SUBQ	SI, DI
	SUBL	$1, DI	// SCASB leaves DI one past the match
	MOVQ	DI, (R8)
	RET

TEXT bytes·Equal(SB),NOSPLIT,$0-49
	MOVQ	a_len+8(FP), BX
	MOVQ	b_len+32(FP), CX
	CMPQ	BX, CX
	JNE	eqret
	MOVQ	a+0(FP), SI
	MOVQ	b+24(FP), DI
	LEAQ	ret+48(FP), AX
	JMP	runtime·memeqbody(SB)
eqret:
	MOVB	$0, ret+48(FP)
	RET

TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), AX
	MOVL	m_fastrand(AX), DX
	ADDL	DX, DX
	MOVL	DX, BX
	XORL	$0x88888eef, DX
	CMOVLMI	BX, DX
	MOVL	DX, m_fastrand(AX)
	MOVL	DX, ret+0(FP)
	RET

TEXT runtime·return0(SB), NOSPLIT, $0
	MOVL	$0, AX
	RET

// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
// Must obey the gcc calling convention.
TEXT _cgo_topofstack(SB),NOSPLIT,$0
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), AX
	MOVQ	m_curg(AX), AX
	MOVQ	(g_stack+stack_hi)(AX), AX
	RET

// The top-most function running on a goroutine
// returns to goexit+PCQuantum.
TEXT runtime·goexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP
	CALL	runtime·goexit1(SB)	// does not return
	// traceback from goexit1 must hit code range of goexit
	BYTE	$0x90	// NOP

TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT0	(AX)
	RET

TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT1	(AX)
	RET

TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT2	(AX)
	RET

TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHNTA	(AX)
	RET

// This is called from .init_array and follows the platform, not Go, ABI.
TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
	PUSHQ	R15	// The access to global variables below implicitly uses R15, which is callee-save
	MOVQ	runtime·lastmoduledatap(SB), AX
	MOVQ	DI, moduledata_next(AX)	// DI carries the moduledata argument (first argument in the platform ABI)
	MOVQ	DI, runtime·lastmoduledatap(SB)
	POPQ	R15
	RET