// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "go_tls.h"
#include "funcdata.h"
#include "textflag.h"

// rt0_go: process entry. Sets up g0's stack bounds from the OS stack,
// detects CPU features, runs cgo/TLS setup, then schedinit/newproc/mstart.
TEXT runtime·rt0_go(SB),NOSPLIT,$0
	// copy arguments forward on an even stack
	MOVQ	DI, AX		// argc
	MOVQ	SI, BX		// argv
	SUBQ	$(4*8+7), SP		// 2args 2auto
	ANDQ	$~15, SP
	MOVQ	AX, 16(SP)
	MOVQ	BX, 24(SP)

	// create istack out of the given (operating system) stack.
	// _cgo_init may update stackguard.
	MOVQ	$runtime·g0(SB), DI
	LEAQ	(-64*1024+104)(SP), BX
	MOVQ	BX, g_stackguard0(DI)
	MOVQ	BX, g_stackguard1(DI)
	MOVQ	BX, (g_stack+stack_lo)(DI)
	MOVQ	SP, (g_stack+stack_hi)(DI)

	// find out information about the processor we're on
	MOVQ	$0, AX
	CPUID			// leaf 0: vendor string in BX,DX,CX; max leaf in AX
	CMPQ	AX, $0
	JE	nocpuinfo

	// Figure out how to serialize RDTSC.
	// On Intel processors LFENCE is enough. AMD requires MFENCE.
	// Don't know about the rest, so let's do MFENCE.
	CMPL	BX, $0x756E6547  // "Genu"
	JNE	notintel
	CMPL	DX, $0x49656E69  // "ineI"
	JNE	notintel
	CMPL	CX, $0x6C65746E  // "ntel"
	JNE	notintel
	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
notintel:
	// Do nothing.

	MOVQ	$1, AX
	CPUID
	MOVL	CX, runtime·cpuid_ecx(SB)
	MOVL	DX, runtime·cpuid_edx(SB)
	// Detect AVX and AVX2 as per 14.7.1  Detection of AVX2 chapter of [1]
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
	ANDL    $0x18000000, CX         // check for OSXSAVE and AVX bits
	CMPL    CX, $0x18000000
	JNE     noavx
	MOVL    $0, CX
	// For XGETBV, OSXSAVE bit is required and sufficient
	BYTE $0x0F; BYTE $0x01; BYTE $0xD0	// XGETBV
	ANDL    $6, AX
	CMPL    AX, $6 // Check for OS support of YMM registers
	JNE     noavx
	MOVB    $1, runtime·support_avx(SB)
	MOVL    $7, AX
	MOVL    $0, CX
	CPUID
	ANDL    $0x20, BX // check for AVX2 bit
	CMPL    BX, $0x20
	JNE     noavx2
	MOVB    $1, runtime·support_avx2(SB)
	JMP     nocpuinfo
noavx:
	MOVB    $0, runtime·support_avx(SB)
noavx2:
	MOVB    $0, runtime·support_avx2(SB)
nocpuinfo:

	// if there is an _cgo_init, call it.
	MOVQ	_cgo_init(SB), AX
	TESTQ	AX, AX
	JZ	needtls
	// g0 already in DI
	MOVQ	DI, CX	// Win64 uses CX for first parameter
	MOVQ	$setg_gcc<>(SB), SI
	CALL	AX

	// update stackguard after _cgo_init
	MOVQ	$runtime·g0(SB), CX
	MOVQ	(g_stack+stack_lo)(CX), AX
	ADDQ	$const__StackGuard, AX
	MOVQ	AX, g_stackguard0(CX)
	MOVQ	AX, g_stackguard1(CX)

#ifndef GOOS_windows
	JMP ok
#endif
needtls:
#ifdef GOOS_plan9
	// skip TLS setup on Plan 9
	JMP ok
#endif
#ifdef GOOS_solaris
	// skip TLS setup on Solaris
	JMP ok
#endif

	LEAQ	runtime·m0+m_tls(SB), DI
	CALL	runtime·settls(SB)

	// store through it, to make sure it works
	get_tls(BX)
	MOVQ	$0x123, g(BX)
	MOVQ	runtime·m0+m_tls(SB), AX
	CMPQ	AX, $0x123
	JEQ 2(PC)
	MOVL	AX, 0	// abort
ok:
	// set the per-goroutine and per-mach "registers"
	get_tls(BX)
	LEAQ	runtime·g0(SB), CX
	MOVQ	CX, g(BX)
	LEAQ	runtime·m0(SB), AX

	// save m->g0 = g0
	MOVQ	CX, m_g0(AX)
	// save m0 to g0->m
	MOVQ	AX, g_m(CX)

	CLD				// convention is D is always left cleared
	CALL	runtime·check(SB)

	MOVL	16(SP), AX		// copy argc
	MOVL	AX, 0(SP)
	MOVQ	24(SP), AX		// copy argv
	MOVQ	AX, 8(SP)
	CALL	runtime·args(SB)
	CALL	runtime·osinit(SB)
	CALL	runtime·schedinit(SB)

	// create a new goroutine to start program
	MOVQ	$runtime·mainPC(SB), AX		// entry
	PUSHQ	AX
	PUSHQ	$0			// arg size
	CALL	runtime·newproc(SB)
	POPQ	AX
	POPQ	AX

	// start this M
	CALL	runtime·mstart(SB)

	MOVL	$0xf1, 0xf1  // crash
	RET

DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
GLOBL	runtime·mainPC(SB),RODATA,$8

TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
	BYTE	$0xcc
	RET

TEXT runtime·asminit(SB),NOSPLIT,$0-0
	// No per-thread init.
	RET

/*
 *  go-routine
 */

// void gosave(Gobuf*)
// save state in Gobuf; setjmp
TEXT runtime·gosave(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), AX		// gobuf
	LEAQ	buf+0(FP), BX		// caller's SP
	MOVQ	BX, gobuf_sp(AX)
	MOVQ	0(SP), BX		// caller's PC
	MOVQ	BX, gobuf_pc(AX)
	MOVQ	$0, gobuf_ret(AX)
	MOVQ	$0, gobuf_ctxt(AX)
	MOVQ	BP, gobuf_bp(AX)
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	BX, gobuf_g(AX)
	RET

// void gogo(Gobuf*)
// restore state from Gobuf; longjmp
TEXT runtime·gogo(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), BX		// gobuf
	MOVQ	gobuf_g(BX), DX
	MOVQ	0(DX), CX		// make sure g != nil
	get_tls(CX)
	MOVQ	DX, g(CX)
	MOVQ	gobuf_sp(BX), SP	// restore SP
	MOVQ	gobuf_ret(BX), AX
	MOVQ	gobuf_ctxt(BX), DX
	MOVQ	gobuf_bp(BX), BP
	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
	MOVQ	$0, gobuf_ret(BX)
	MOVQ	$0, gobuf_ctxt(BX)
	MOVQ	$0, gobuf_bp(BX)
	MOVQ	gobuf_pc(BX), BX
	JMP	BX			// never returns; resumes at saved gobuf_pc

// func mcall(fn func(*g))
// Switch to m->g0's stack, call fn(g).
// Fn must never return. It should gogo(&g->sched)
// to keep running g.
TEXT runtime·mcall(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI

	get_tls(CX)
	MOVQ	g(CX), AX	// save state in g->sched
	MOVQ	0(SP), BX	// caller's PC
	MOVQ	BX, (g_sched+gobuf_pc)(AX)
	LEAQ	fn+0(FP), BX	// caller's SP
	MOVQ	BX, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to m->g0 & its stack, call fn
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	SI, AX	// if g == m->g0 call badmcall
	JNE	3(PC)
	MOVQ	$runtime·badmcall(SB), AX
	JMP	AX
	MOVQ	SI, g(CX)	// g = m->g0
	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	PUSHQ	AX	// argument to fn: the g we came from
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI
	POPQ	AX
	MOVQ	$runtime·badmcall2(SB), AX
	JMP	AX
	RET

// systemstack_switch is a dummy routine that systemstack leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the system stack because the one at the top of
// the system stack terminates the stack walk (see topofstack()).
TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
	RET

// func systemstack(fn func())
TEXT runtime·systemstack(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI	// DI = fn
	get_tls(CX)
	MOVQ	g(CX), AX	// AX = g
	MOVQ	g_m(AX), BX	// BX = m

	MOVQ	m_gsignal(BX), DX	// DX = gsignal
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_g0(BX), DX	// DX = g0
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_curg(BX), R8
	CMPQ	AX, R8
	JEQ	switch

	// Bad: g is not gsignal, not g0, not curg. What is it?
	MOVQ	$runtime·badsystemstack(SB), AX
	CALL	AX

switch:
	// save our state in g->sched. Pretend to
	// be systemstack_switch if the G stack is scanned.
	MOVQ	$runtime·systemstack_switch(SB), SI
	MOVQ	SI, (g_sched+gobuf_pc)(AX)
	MOVQ	SP, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to g0
	MOVQ	DX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(DX), BX
	// make it look like mstart called systemstack on g0, to stop traceback
	SUBQ	$8, BX
	MOVQ	$runtime·mstart(SB), DX
	MOVQ	DX, 0(BX)
	MOVQ	BX, SP

	// call target function
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI

	// switch back to g
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), BX
	MOVQ	m_curg(BX), AX
	MOVQ	AX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(AX), SP
	MOVQ	$0, (g_sched+gobuf_sp)(AX)
	RET

noswitch:
	// already on m stack, just call directly
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI
	RET

/*
 * support for morestack
 */

// Called during function prolog when more stack is needed.
//
// The traceback routines see morestack on a g0 as being
// the top of a stack (for example, morestack calling newstack
// calling the scheduler calling newm calling gc), so we must
// record an argument size. For that purpose, it has no arguments.
TEXT runtime·morestack(SB),NOSPLIT,$0-0
	// Cannot grow scheduler stack (m->g0).
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	g(CX), SI
	JNE	2(PC)
	INT	$3

	// Cannot grow signal stack (m->gsignal).
	MOVQ	m_gsignal(BX), SI
	CMPQ	g(CX), SI
	JNE	2(PC)
	INT	$3

	// Called from f.
	// Set m->morebuf to f's caller.
	MOVQ	8(SP), AX	// f's caller's PC
	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
	LEAQ	16(SP), AX	// f's caller's SP
	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SI, (m_morebuf+gobuf_g)(BX)

	// Set g->sched to context in f.
	MOVQ	0(SP), AX // f's PC
	MOVQ	AX, (g_sched+gobuf_pc)(SI)
	MOVQ	SI, (g_sched+gobuf_g)(SI)
	LEAQ	8(SP), AX // f's SP
	MOVQ	AX, (g_sched+gobuf_sp)(SI)
	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
	MOVQ	BP, (g_sched+gobuf_bp)(SI)

	// Call newstack on m->g0's stack.
	MOVQ	m_g0(BX), BX
	MOVQ	BX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(BX), SP
	CALL	runtime·newstack(SB)
	MOVQ	$0, 0x1003	// crash if newstack returns
	RET

// morestack but not preserving ctxt.
TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
	MOVL	$0, DX
	JMP	runtime·morestack(SB)

TEXT runtime·stackBarrier(SB),NOSPLIT,$0
	// We came here via a RET to an overwritten return PC.
	// AX may be live. Other registers are available.

	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
	get_tls(CX)
	MOVQ	g(CX), CX
	MOVQ	(g_stkbar+slice_array)(CX), DX
	MOVQ	g_stkbarPos(CX), BX
	IMULQ	$stkbar__size, BX	// Too big for SIB.
	MOVQ	stkbar_savedLRPtr(DX)(BX*1), R8
	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
	// Assert that we're popping the right saved LR.
	ADDQ	$8, R8
	CMPQ	R8, SP
	JEQ	2(PC)
	MOVL	$0, 0
	// Record that this stack barrier was hit.
	ADDQ	$1, g_stkbarPos(CX)
	// Jump to the original return PC.
	JMP	BX

// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
// of constant-sized-frame functions to encode a few bits of size in the pc.
// Caution: ugly multiline assembly macros in your future!

#define DISPATCH(NAME,MAXSIZE)		\
	CMPQ	CX, $MAXSIZE;		\
	JA	3(PC);			\
	MOVQ	$NAME(SB), AX;		\
	JMP	AX
// Note: can't just "JMP	NAME(SB)" - bad inlining results.

TEXT reflect·call(SB), NOSPLIT, $0-0
	JMP	·reflectcall(SB)

TEXT ·reflectcall(SB), NOSPLIT, $0-32
	MOVLQZX argsize+24(FP), CX
	// NOTE(rsc): No call16, because CALLFN needs four words
	// of argument space to invoke callwritebarrier.
	DISPATCH(runtime·call32, 32)
	DISPATCH(runtime·call64, 64)
	DISPATCH(runtime·call128, 128)
	DISPATCH(runtime·call256, 256)
	DISPATCH(runtime·call512, 512)
	DISPATCH(runtime·call1024, 1024)
	DISPATCH(runtime·call2048, 2048)
	DISPATCH(runtime·call4096, 4096)
	DISPATCH(runtime·call8192, 8192)
	DISPATCH(runtime·call16384, 16384)
	DISPATCH(runtime·call32768, 32768)
	DISPATCH(runtime·call65536, 65536)
	DISPATCH(runtime·call131072, 131072)
	DISPATCH(runtime·call262144, 262144)
	DISPATCH(runtime·call524288, 524288)
	DISPATCH(runtime·call1048576, 1048576)
	DISPATCH(runtime·call2097152, 2097152)
	DISPATCH(runtime·call4194304, 4194304)
	DISPATCH(runtime·call8388608, 8388608)
	DISPATCH(runtime·call16777216, 16777216)
	DISPATCH(runtime·call33554432, 33554432)
	DISPATCH(runtime·call67108864, 67108864)
	DISPATCH(runtime·call134217728, 134217728)
	DISPATCH(runtime·call268435456, 268435456)
	DISPATCH(runtime·call536870912, 536870912)
	DISPATCH(runtime·call1073741824, 1073741824)
	MOVQ	$runtime·badreflectcall(SB), AX
	JMP	AX

#define CALLFN(NAME,MAXSIZE)			\
TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
	NO_LOCAL_POINTERS;			\
	/* copy arguments to stack */		\
	MOVQ	argptr+16(FP), SI;		\
	MOVLQZX argsize+24(FP), CX;		\
	MOVQ	SP, DI;				\
	REP;MOVSB;				\
	/* call function */			\
	MOVQ	f+8(FP), DX;			\
	PCDATA  $PCDATA_StackMapIndex, $0;	\
	CALL	(DX);				\
	/* copy return values back */		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX	argsize+24(FP), CX;		\
	MOVLQZX retoffset+28(FP), BX;		\
	MOVQ	SP, SI;				\
	ADDQ	BX, DI;				\
	ADDQ	BX, SI;				\
	SUBQ	BX, CX;				\
	REP;MOVSB;				\
	/* execute write barrier updates */	\
	MOVQ	argtype+0(FP), DX;		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX	argsize+24(FP), CX;		\
	MOVLQZX retoffset+28(FP), BX;		\
	MOVQ	DX, 0(SP);			\
	MOVQ	DI, 8(SP);			\
	MOVQ	CX, 16(SP);			\
	MOVQ	BX, 24(SP);			\
	CALL	runtime·callwritebarrier(SB);	\
	RET

CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
CALLFN(·call256, 256)
CALLFN(·call512, 512)
CALLFN(·call1024, 1024)
CALLFN(·call2048, 2048)
CALLFN(·call4096, 4096)
CALLFN(·call8192, 8192)
CALLFN(·call16384, 16384)
CALLFN(·call32768, 32768)
CALLFN(·call65536, 65536)
CALLFN(·call131072, 131072)
CALLFN(·call262144, 262144)
CALLFN(·call524288, 524288)
CALLFN(·call1048576, 1048576)
CALLFN(·call2097152, 2097152)
CALLFN(·call4194304, 4194304)
CALLFN(·call8388608, 8388608)
CALLFN(·call16777216, 16777216)
CALLFN(·call33554432, 33554432)
CALLFN(·call67108864, 67108864)
CALLFN(·call134217728, 134217728)
CALLFN(·call268435456, 268435456)
CALLFN(·call536870912, 536870912)
CALLFN(·call1073741824, 1073741824)

TEXT runtime·procyield(SB),NOSPLIT,$0-0
	MOVL	cycles+0(FP), AX
again:
	PAUSE
	SUBL	$1, AX
	JNZ	again
	RET


TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
	// Stores are already ordered on x86, so this is just a
	// compile barrier.
	RET

// void jmpdefer(fn, sp);
// called from deferreturn.
// 1. pop the caller
// 2. sub 5 bytes from the callers return
// 3. jmp to the argument
TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
	MOVQ	fv+0(FP), DX	// fn
	MOVQ	argp+8(FP), BX	// caller sp
	LEAQ	-8(BX), SP	// caller sp after CALL
	SUBQ	$5, (SP)	// return to CALL again
	MOVQ	0(DX), BX
	JMP	BX	// but first run the deferred function

// Save state of caller into g->sched. Smashes R8, R9.
TEXT gosave<>(SB),NOSPLIT,$0
	get_tls(R8)
	MOVQ	g(R8), R8
	MOVQ	0(SP), R9
	MOVQ	R9, (g_sched+gobuf_pc)(R8)
	LEAQ	8(SP), R9
	MOVQ	R9, (g_sched+gobuf_sp)(R8)
	MOVQ	$0, (g_sched+gobuf_ret)(R8)
	MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
	MOVQ	BP, (g_sched+gobuf_bp)(R8)
	RET

// func asmcgocall(fn, arg unsafe.Pointer) int32
// Call fn(arg) on the scheduler stack,
// aligned appropriately for the gcc ABI.
// See cgocall.go for more details.
TEXT ·asmcgocall(SB),NOSPLIT,$0-20
	MOVQ	fn+0(FP), AX
	MOVQ	arg+8(FP), BX

	MOVQ	SP, DX

	// Figure out if we need to switch to m->g0 stack.
	// We get called to create new OS threads too, and those
	// come in on the m->g0 stack already.
	get_tls(CX)
	MOVQ	g(CX), R8
	CMPQ	R8, $0
	JEQ	nosave
	MOVQ	g_m(R8), R8
	MOVQ	m_g0(R8), SI
	MOVQ	g(CX), DI
	CMPQ	SI, DI
	JEQ	nosave
	MOVQ	m_gsignal(R8), SI
	CMPQ	SI, DI
	JEQ	nosave

	// Switch to system stack.
	MOVQ	m_g0(R8), SI
	CALL	gosave<>(SB)
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP

	// Now on a scheduling stack (a pthread-created stack).
	// Make sure we have enough room for 4 stack-backed fast-call
	// registers as per windows amd64 calling convention.
	SUBQ	$64, SP
	ANDQ	$~15, SP	// alignment for gcc ABI
	MOVQ	DI, 48(SP)	// save g
	MOVQ	(g_stack+stack_hi)(DI), DI
	SUBQ	DX, DI
	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
	MOVQ	BX, CX		// CX = first argument in Win64
	CALL	AX

	// Restore registers, g, stack pointer.
	get_tls(CX)
	MOVQ	48(SP), DI
	MOVQ	(g_stack+stack_hi)(DI), SI
	SUBQ	40(SP), SI
	MOVQ	DI, g(CX)
	MOVQ	SI, SP

	MOVL	AX, ret+16(FP)
	RET

nosave:
	// Running on a system stack, perhaps even without a g.
	// Having no g can happen during thread creation or thread teardown
	// (see needm/dropm on Solaris, for example).
	// This code is like the above sequence but without saving/restoring g
	// and without worrying about the stack moving out from under us
	// (because we're on a system stack, not a goroutine stack).
	// The above code could be used directly if already on a system stack,
	// but then the only path through this code would be a rare case on Solaris.
	// Using this code for all "already on system stack" calls exercises it more,
	// which should help keep it correct.
	SUBQ	$64, SP
	ANDQ	$~15, SP
	MOVQ	$0, 48(SP)	// where above code stores g, in case someone looks during debugging
	MOVQ	DX, 40(SP)	// save original stack pointer
	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
	MOVQ	BX, CX		// CX = first argument in Win64
	CALL	AX
	MOVQ	40(SP), SI	// restore original stack pointer
	MOVQ	SI, SP
	MOVL	AX, ret+16(FP)
	RET

// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
// Turn the fn into a Go func (by taking its address) and call
// cgocallback_gofunc.
TEXT runtime·cgocallback(SB),NOSPLIT,$24-24
	LEAQ	fn+0(FP), AX
	MOVQ	AX, 0(SP)
	MOVQ	frame+8(FP), AX
	MOVQ	AX, 8(SP)
	MOVQ	framesize+16(FP), AX
	MOVQ	AX, 16(SP)
	MOVQ	$runtime·cgocallback_gofunc(SB), AX
	CALL	AX
	RET

// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
// See cgocall.go for more details.
TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-24
	NO_LOCAL_POINTERS

	// If g is nil, Go did not create the current thread.
	// Call needm to obtain one m for temporary use.
	// In this case, we're running on the thread stack, so there's
	// lots of space, but the linker doesn't know. Hide the call from
	// the linker analysis by using an indirect call through AX.
	get_tls(CX)
#ifdef GOOS_windows
	MOVL	$0, BX
	CMPQ	CX, $0
	JEQ	2(PC)
#endif
	MOVQ	g(CX), BX
	CMPQ	BX, $0
	JEQ	needm
	MOVQ	g_m(BX), BX
	MOVQ	BX, R8 // holds oldm until end of function
	JMP	havem
needm:
	MOVQ	$0, 0(SP)
	MOVQ	$runtime·needm(SB), AX
	CALL	AX
	MOVQ	0(SP), R8
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX

	// Set m->sched.sp = SP, so that if a panic happens
	// during the function we are about to execute, it will
	// have a valid SP to run on the g0 stack.
	// The next few lines (after the havem label)
	// will save this SP onto the stack and then write
	// the same SP back to m->sched.sp. That seems redundant,
	// but if an unrecovered panic happens, unwindm will
	// restore the g->sched.sp from the stack location
	// and then systemstack will try to use it. If we don't set it here,
	// that restored SP will be uninitialized (typically 0) and
	// will not be usable.
	MOVQ	m_g0(BX), SI
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

havem:
	// Now there's a valid m, and we're running on its m->g0.
	// Save current m->g0->sched.sp on stack and then set it to SP.
	// Save current sp in m->g0->sched.sp in preparation for
	// switch back to m->curg stack.
	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
	MOVQ	m_g0(BX), SI
	MOVQ	(g_sched+gobuf_sp)(SI), AX
	MOVQ	AX, 0(SP)
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

	// Switch to m->curg stack and call runtime.cgocallbackg.
	// Because we are taking over the execution of m->curg
	// but *not* resuming what had been running, we need to
	// save that information (m->curg->sched) so we can restore it.
	// We can restore m->curg->sched.sp easily, because calling
	// runtime.cgocallbackg leaves SP unchanged upon return.
	// To save m->curg->sched.pc, we push it onto the stack.
	// This has the added benefit that it looks to the traceback
	// routine like cgocallbackg is going to return to that
	// PC (because the frame we allocate below has the same
	// size as cgocallback_gofunc's frame declared above)
	// so that the traceback will seamlessly trace back into
	// the earlier calls.
	//
	// In the new goroutine, 0(SP) holds the saved R8.
	MOVQ	m_curg(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
	MOVQ	(g_sched+gobuf_pc)(SI), BX
	MOVQ	BX, -8(DI)
	// Compute the size of the frame, including return PC and, if
	// GOEXPERIMENT=framepointer, the saved based pointer
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX
	SUBQ	AX, DI
	MOVQ	DI, SP

	MOVQ	R8, 0(SP)
	CALL	runtime·cgocallbackg(SB)
	MOVQ	0(SP), R8

	// Compute the size of the frame again. FP and SP have
	// completely different values here than they did above,
	// but only their difference matters.
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX

	// Restore g->sched (== m->curg->sched) from saved values.
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SP, DI
	ADDQ	AX, DI
	MOVQ	-8(DI), BX
	MOVQ	BX, (g_sched+gobuf_pc)(SI)
	MOVQ	DI, (g_sched+gobuf_sp)(SI)

	// Switch back to m->g0's stack and restore m->g0->sched.sp.
	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
	// so we do not have to restore it.)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP
	MOVQ	0(SP), AX
	MOVQ	AX, (g_sched+gobuf_sp)(SI)

	// If the m on entry was nil, we called needm above to borrow an m
	// for the duration of the call. Since the call is over, return it with dropm.
	CMPQ	R8, $0
	JNE 3(PC)
	MOVQ	$runtime·dropm(SB), AX
	CALL	AX

	// Done!
	RET

// void setg(G*); set g. for use by needm.
TEXT runtime·setg(SB), NOSPLIT, $0-8
	MOVQ	gg+0(FP), BX
#ifdef GOOS_windows
	CMPQ	BX, $0
	JNE	settls
	MOVQ	$0, 0x28(GS)
	RET
settls:
	MOVQ	g_m(BX), AX
	LEAQ	m_tls(AX), AX
	MOVQ	AX, 0x28(GS)
#endif
	get_tls(CX)
	MOVQ	BX, g(CX)
	RET

// void setg_gcc(G*); set g called from gcc.
TEXT setg_gcc<>(SB),NOSPLIT,$0
	get_tls(AX)
	MOVQ	DI, g(AX)
	RET

// check that SP is in range [g->stack.lo, g->stack.hi)
TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
	get_tls(CX)
	MOVQ	g(CX), AX
	CMPQ	(g_stack+stack_hi)(AX), SP
	JHI	2(PC)
	INT	$3
	CMPQ	SP, (g_stack+stack_lo)(AX)
	JHI	2(PC)
	INT	$3
	RET

TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
	MOVQ	argp+0(FP),AX		// addr of first arg
	MOVQ	-8(AX),AX		// get calling pc
	CMPQ	AX, runtime·stackBarrierPC(SB)
	JNE	nobar
	// Get original return PC.
	CALL	runtime·nextBarrierPC(SB)
	MOVQ	0(SP), AX
nobar:
	MOVQ	AX, ret+8(FP)
	RET

TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
	MOVQ	argp+0(FP),AX		// addr of first arg
	MOVQ	pc+8(FP), BX
	MOVQ	-8(AX), CX
	CMPQ	CX, runtime·stackBarrierPC(SB)
	JEQ	setbar
	MOVQ	BX, -8(AX)		// set calling pc
	RET
setbar:
	// Set the stack barrier return PC.
	MOVQ	BX, 0(SP)
	CALL	runtime·setNextBarrierPC(SB)
	RET

TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
	MOVQ	argp+0(FP), AX
	MOVQ	AX, ret+8(FP)
	RET

// func cputicks() int64
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
	JNE	mfence
	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
	JMP	done
mfence:
	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
done:
	RDTSC			// counter high 32 bits in DX, low 32 in AX
	SHLQ	$32, DX
	ADDQ	DX, AX
	MOVQ	AX, ret+0(FP)
	RET

// memhash_varlen(p unsafe.Pointer, h seed) uintptr
// redirects to memhash(p, h, size) using the size
// stored in the closure.
TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
	GO_ARGS
	NO_LOCAL_POINTERS
	MOVQ	p+0(FP), AX
	MOVQ	h+8(FP), BX
	MOVQ	8(DX), CX	// size, stored in the closure (DX)
	MOVQ	AX, 0(SP)
	MOVQ	BX, 8(SP)
	MOVQ	CX, 16(SP)
	CALL	runtime·memhash(SB)
	MOVQ	24(SP), AX
	MOVQ	AX, ret+16(FP)
	RET

// hash function using AES hardware instructions
TEXT runtime·aeshash(SB),NOSPLIT,$0-32
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	s+16(FP), CX	// size
	LEAQ	ret+24(FP), DX
	JMP	runtime·aeshashbody(SB)

TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to string struct
	MOVQ	8(AX), CX	// length of string
	MOVQ	(AX), AX	// string data
	LEAQ	ret+16(FP), DX
	JMP	runtime·aeshashbody(SB)

// AX: data
// CX: length
// DX: address to put return value
TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
	// Fill an SSE register with our seeds.
	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
	PINSRW	$4, CX, X0			// 16 bits of length
	PSHUFHW $0, X0, X0			// repeat length 4 times total
	MOVO	X0, X1				// save unscrambled seed
	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
	AESENC	X0, X0				// scramble seed

	CMPQ	CX, $16
	JB	aes0to15
	JE	aes16
	CMPQ	CX, $32
	JBE	aes17to32
	CMPQ	CX, $64
	JBE	aes33to64
	CMPQ	CX, $128
	JBE	aes65to128
	JMP	aes129plus

aes0to15:
	TESTQ	CX, CX
	JE	aes0

	ADDQ	$16, AX
	TESTW	$0xff0, AX
	JE	endofpage

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X1
	ADDQ	CX, CX			// masks<> entries are 16 bytes: index (CX*2)*8
	MOVQ	$masks<>(SB), AX
	PAND	(AX)(CX*8), X1
final1:
	AESENC	X0, X1	// scramble input, xor in seed
	AESENC	X1, X1  // scramble combo 2 times
	AESENC	X1, X1
	MOVQ	X1, (DX)
	RET

endofpage:
	// address ends in 1111xxxx. Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	MOVOU	-32(AX)(CX*1), X1
	ADDQ	CX, CX
	MOVQ	$shifts<>(SB), AX
	PSHUFB	(AX)(CX*8), X1
	JMP	final1

aes0:
	// Return scrambled input seed
	AESENC	X0, X0
	MOVQ	X0, (DX)
	RET

aes16:
	MOVOU	(AX), X1
	JMP	final1

aes17to32:
	// make second starting seed
	PXOR	runtime·aeskeysched+16(SB), X1
	AESENC	X1, X1

	// load data to be hashed
	MOVOU	(AX), X2
	MOVOU	-16(AX)(CX*1), X3

	// scramble 3 times
	AESENC	X0, X2
	AESENC	X1, X3
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3

	// combine results
	PXOR	X3, X2
	MOVQ	X2, (DX)
	RET

aes33to64:
	// make 3 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3

	MOVOU	(AX), X4
	MOVOU	16(AX), X5
	MOVOU	-32(AX)(CX*1), X6
	MOVOU	-16(AX)(CX*1), X7

	AESENC	X0, X4
	AESENC	X1, X5
	AESENC	X2, X6
	AESENC	X3, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	PXOR	X6, X4
	PXOR	X7, X5
	PXOR	X5, X4
	MOVQ	X4, (DX)
	RET

aes65to128:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// load data
	MOVOU	(AX), X8
	MOVOU	16(AX), X9
	MOVOU	32(AX), X10
	MOVOU	48(AX), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// scramble data, xor in seed
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	// scramble twice
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

aes129plus:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// start with last (possibly overlapping) block
	MOVOU	-128(AX)(CX*1), X8
	MOVOU	-112(AX)(CX*1), X9
	MOVOU	-96(AX)(CX*1), X10
	MOVOU	-80(AX)(CX*1), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// scramble input once, xor in seed
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	// compute number of remaining 128-byte blocks
	DECQ	CX
	SHRQ	$7, CX

aesloop:
	// scramble state, xor in a block
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	MOVOU	64(AX), X4
	MOVOU	80(AX), X5
	MOVOU	96(AX), X6
	MOVOU	112(AX), X7
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	// scramble state
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	ADDQ	$128, AX
	DECQ	CX
	JNE	aesloop

	// 2 more scrambles to finish
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

// simple mask to get rid of data in the high part of the register.
// masks<> is indexed by byte count i (16 bytes per entry):
// entry i is i 0xff bytes followed by zeros, i.e. a PAND mask that
// keeps the low i bytes of an xmm register.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256

// checkASM reports (in ret+0) whether the assembly data tables are
// usable: true iff masks<> and shifts<> are both 16-byte aligned.
TEXT ·checkASM(SB),NOSPLIT,$0-1
	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
	MOVQ	$masks<>(SB), AX
	MOVQ	$shifts<>(SB), BX
	ORQ	BX, AX
	TESTQ	$15, AX
	SETEQ	ret+0(FP)
	RET

// these are arguments to pshufb. They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
DATA shifts<>+0x00(SB)/8, $0x0000000000000000
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256

// memeq(a, b unsafe.Pointer, size uintptr) bool
// Thin wrapper: loads the memeqbody register contract and tail-jumps.
TEXT runtime·memeq(SB),NOSPLIT,$0-25
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	MOVQ	size+16(FP), BX
	LEAQ	ret+24(FP), AX
	JMP	runtime·memeqbody(SB)

// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq		// same pointer: trivially equal
	MOVQ	8(DX), BX	// compiler stores size at offset 8 in the closure
	LEAQ	ret+16(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+16(FP)
	RET

// eqstring tests whether two strings are equal.
// The compiler guarantees that strings passed
// to eqstring have equal length.
// See runtime_test.go:eqstring_generic for
// equivalent Go code.
TEXT runtime·eqstring(SB),NOSPLIT,$0-33
	MOVQ	s1str+0(FP), SI
	MOVQ	s2str+16(FP), DI
	CMPQ	SI, DI
	JEQ	eq		// same backing pointer (+ equal length): equal
	MOVQ	s1len+8(FP), BX
	LEAQ	v+32(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, v+32(FP)
	RET

// a in SI
// b in DI
// count in BX
// address of result byte in AX
TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small
	CMPQ	BX, $64
	JB	bigloop
	CMPB	runtime·support_avx2(SB), $1
	JE	hugeloop_avx2

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff	// all 16 mask bits set <=> all 64 bytes equal
	JEQ	hugeloop
	MOVB	$0, (AX)
	RET

	// 64 bytes at a time using ymm registers
	// NOTE(review): MOVHDU here appears to be this fork's mnemonic for a
	// 32-byte unaligned vector load (two loads + ADDQ $64 advance) — confirm
	// against the fork's assembler before editing.
hugeloop_avx2:
	CMPQ	BX, $64
	JB	bigloop_avx2
	MOVHDU	(SI), X0
	MOVHDU	(DI), X1
	MOVHDU	32(SI), X2
	MOVHDU	32(DI), X3
	VPCMPEQB X1, X0, X4
	VPCMPEQB X2, X3, X5
	VPAND	X4, X5, X6
	VPMOVMSKB X6, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffffffff
	JEQ	hugeloop_avx2
	VZEROUPPER		// leave AVX state before returning via SSE/scalar code
	MOVB	$0, (AX)
	RET

bigloop_avx2:
	VZEROUPPER

	// 8 bytes at a time using 64-bit register
bigloop:
	CMPQ	BX, $8
	JBE	leftover
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	bigloop
	MOVB	$0, (AX)
	RET

	// remaining 0-8 bytes: one overlapping 8-byte load from each end
leftover:
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	(AX)
	RET

small:
	CMPQ	BX, $0
	JEQ	equal

	LEAQ	0(BX*8), CX	// bytes left -> bits left
	NEGQ	CX		// 64 - bits left (mod 64), shift count for the partial word

	CMPB	SI, $0xf8
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
di_finish:

	// compare only the BX low bytes: difference shifted out => ZF set
	SUBQ	SI, DI
	SHLQ	CX, DI
equal:
	SETEQ	(AX)
	RET

// cmpstring(s1, s2 string) int — wrapper loading the cmpbody contract.
TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
	MOVQ	s1_base+0(FP), SI
	MOVQ	s1_len+8(FP), BX
	MOVQ	s2_base+16(FP), DI
	MOVQ	s2_len+24(FP), DX
	LEAQ	ret+32(FP), R9
	JMP	runtime·cmpbody(SB)

// bytes.Compare(s1, s2 []byte) int — wrapper loading the cmpbody contract.
TEXT bytes·Compare(SB),NOSPLIT,$0-56
	MOVQ	s1+0(FP), SI
	MOVQ	s1+8(FP), BX
	MOVQ	s2+24(FP), DI
	MOVQ	s2+32(FP), DX
	LEAQ	res+48(FP), R9
	JMP	runtime·cmpbody(SB)

// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8	// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	CMPB	runtime·support_avx2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop

	// 16 bytes at a time until <=16 remain
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16		// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// entry points from big_loop: step SI/DI forward to the 16-byte
	// chunk (at offset 48/32/16) where the difference was found.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// overlapping 8-byte load covering the tail
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		// -bits left (== 64 - bits left mod 64)
	JEQ	allsame

	// load bytes of a into high bytes of SI
	CMPB	SI, $0xf8
	JA	si_high		// load near a page end must not cross it
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI

	// load bytes of b into high bytes of DI
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// the common prefix is identical: order by length
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET

	// this works for >= 64 bytes of data.
big_loop:
	// compare 64 bytes as four 16-byte SSE chunks; diff16/32/48/64
	// recover the offset of the failing chunk.
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop

// Compare 64-bytes per loop iteration.
// Loop is unrolled and uses AVX2.
big_loop_avx2:
	MOVHDU	(SI), X2
	MOVHDU	(DI), X3
	MOVHDU	32(SI), X4
	MOVHDU	32(DI), X5
	VPCMPEQB X2, X3, X0
	VPMOVMSKB X0, AX
	XORL	$0xffffffff, AX
	JNE	diff32_avx2
	VPCMPEQB X4, X5, X6
	VPMOVMSKB X6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP	diff16

	// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP	diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP	loop


// TODO: Also use this in bytes.Index
// indexShortStr(s, c string) int — index of first occurrence of short
// needle c in s, or -1. DI walks s; CX is the last valid start position
// for each strategy; BX = len(c) selects the strategy below.
TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
	MOVQ	s+0(FP), DI
	MOVQ	s_len+8(FP), CX
	MOVQ	c+16(FP), AX
	MOVQ	c_len+24(FP), BX
	CMPQ	BX, CX
	JA	fail		// needle longer than haystack
	CMPQ	BX, $2
	JA	_3_or_more
	// len(c) <= 2: compare as one 2-byte word
	MOVW	(AX), AX
	LEAQ	-1(DI)(CX*1), CX
loop2:
	MOVW	(DI), SI
	CMPW	SI,AX
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop2
	JMP	fail
_3_or_more:
	CMPQ	BX, $3
	JA	_4_or_more
	// len(c) == 3: two overlapping 2-byte compares
	MOVW	1(AX), DX
	MOVW	(AX), AX
	LEAQ	-2(DI)(CX*1), CX
loop3:
	MOVW	(DI), SI
	CMPW	SI,AX
	JZ	partial_success3
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop3
	JMP	fail
partial_success3:
	MOVW	1(DI), SI
	CMPW	SI,DX
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop3
	JMP	fail
_4_or_more:
	CMPQ	BX, $4
	JA	_5_or_more
	// len(c) == 4: one 4-byte compare
	MOVL	(AX), AX
	LEAQ	-3(DI)(CX*1), CX
loop4:
	MOVL	(DI), SI
	CMPL	SI,AX
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop4
	JMP	fail
_5_or_more:
	CMPQ	BX, $7
	JA	_8_or_more
	// len(c) in 5..7: overlapping 4-byte compares of head and tail
	LEAQ	1(DI)(CX*1), CX
	SUBQ	BX, CX
	MOVL	-4(AX)(BX*1), DX
	MOVL	(AX), AX
loop5to7:
	MOVL	(DI), SI
	CMPL	SI,AX
	JZ	partial_success5to7
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop5to7
	JMP	fail
partial_success5to7:
	MOVL	-4(BX)(DI*1), SI
	CMPL	SI,DX
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop5to7
	JMP	fail
_8_or_more:
	CMPQ	BX, $8
	JA	_9_or_more
	// len(c) == 8: one 8-byte compare
	MOVQ	(AX), AX
	LEAQ	-7(DI)(CX*1), CX
loop8:
	MOVQ	(DI), SI
	CMPQ	SI,AX
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop8
	JMP	fail
_9_or_more:
	// NOTE(review): with $16 here the BX==16 case is handled by
	// loop9to15's overlapping 8-byte compares (still correct), and the
	// loop16 path below becomes unreachable; upstream Go uses $15.
	// Confirm which is intended before changing.
	CMPQ	BX, $16
	JA	_16_or_more
	LEAQ	1(DI)(CX*1), CX
	SUBQ	BX, CX
	MOVQ	-8(AX)(BX*1), DX
	MOVQ	(AX), AX
loop9to15:
	MOVQ	(DI), SI
	CMPQ	SI,AX
	JZ	partial_success9to15
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop9to15
	JMP	fail
partial_success9to15:
	MOVQ	-8(BX)(DI*1), SI
	CMPQ	SI,DX
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop9to15
	JMP	fail
_16_or_more:
	CMPQ	BX, $16
	JA	_17_to_31
	// len(c) == 16: one full xmm compare per position
	MOVOU	(AX), X1
	LEAQ	-15(DI)(CX*1), CX
loop16:
	MOVOU	(DI), X2
	PCMPEQB	X1, X2
	PMOVMSKB X2, SI
	CMPQ	SI, $0xffff
	JE	success
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop16
	JMP	fail
_17_to_31:
	// len(c) in 17..31: overlapping 16-byte compares of head and tail
	LEAQ	1(DI)(CX*1), CX
	SUBQ	BX, CX
	MOVOU	-16(AX)(BX*1), X0
	MOVOU	(AX), X1
loop17to31:
	MOVOU	(DI), X2
	PCMPEQB	X1,X2
	PMOVMSKB X2, SI
	CMPQ	SI, $0xffff
	JE	partial_success17to31
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop17to31
	JMP	fail
partial_success17to31:
	MOVOU	-16(BX)(DI*1), X3
	PCMPEQB	X0, X3
	PMOVMSKB X3, SI
	CMPQ	SI, $0xffff
	JE	success
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop17to31
fail:
	MOVQ	$-1, ret+32(FP)
	RET
success:
	SUBQ	s+0(FP), DI	// convert pointer to index
	MOVQ	DI, ret+32(FP)
	RET


// bytes.IndexByte(s []byte, c byte) int — wrapper for indexbytebody.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
	MOVQ	s+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+24(FP), AL
	LEAQ	ret+32(FP), R8
	JMP	runtime·indexbytebody(SB)

// strings.IndexByte(s string, c byte) int — wrapper for indexbytebody.
TEXT strings·IndexByte(SB),NOSPLIT,$0-32
	MOVQ	s+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+16(FP), AL
	LEAQ	ret+24(FP), R8
	JMP	runtime·indexbytebody(SB)

// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
	MOVQ	SI, DI

	CMPQ	BX, $16
	JLT	small

	CMPQ	BX, $32
	JA	avx2
no_avx2:
	// round up to first 16-byte boundary
	TESTQ	$15, SI
	JZ	aligned
	MOVQ	SI, CX
	ANDQ	$~15, CX
	ADDQ	$16, CX

	// search the beginning
	SUBQ	SI, CX
	REPN; SCASB
	JZ	success

	// DI is 16-byte aligned; get ready to search using SSE instructions
aligned:
	// round down to last 16-byte boundary
	MOVQ	BX, R11
	ADDQ	SI, R11
	ANDQ	$~15, R11

	// shuffle X0 around so that each byte contains c
	MOVD	AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL	$0, X0, X0
	JMP	condition

sse:
	// move the next 16-byte chunk of the buffer into X1
	MOVO	(DI), X1
	// compare bytes in X0 to X1
	PCMPEQB	X0, X1
	// take the top bit of each byte in X1 and put the result in DX
	PMOVMSKB X1, DX
	TESTL	DX, DX
	JNZ	ssesuccess
	ADDQ	$16, DI

condition:
	CMPQ	DI, R11
	JLT	sse

	// search the end
	MOVQ	SI, CX
	ADDQ	BX, CX
	SUBQ	R11, CX
	// if CX == 0, the zero flag will be set and we'll end up
	// returning a false success
	JZ	failure
	REPN; SCASB
	JZ	success

failure:
	MOVQ	$-1, (R8)
	RET

	// handle for lengths < 16
small:
	MOVQ	BX, CX
	REPN; SCASB
	JZ	success
	MOVQ	$-1, (R8)
	RET

avx2:
	CMPB	runtime·support_avx2(SB), $1
	JNE	no_avx2
	MOVD	AX, X0
	LEAQ	-32(SI)(BX*1), R11	// last valid 32-byte chunk start
	VPBROADCASTB X0, X1
avx2_loop:
	MOVHDU	(DI), X2
	VPCMPEQB X1, X2, X3
	VPTEST	X3, X3
	JNZ	avx2success
	ADDQ	$32, DI
	CMPQ	DI, R11
	JLT	avx2_loop
	// final (possibly overlapping) 32-byte chunk at the very end
	MOVQ	R11, DI
	MOVHDU	(DI), X2
	VPCMPEQB X1, X2, X3
	VPTEST	X3, X3
	JNZ	avx2success
	VZEROUPPER
	MOVQ	$-1, (R8)
	RET

avx2success:
	VPMOVMSKB X3, DX
	BSFL	DX, DX	// lowest set bit = first match within chunk
	SUBQ	SI, DI
	ADDQ	DI, DX
	MOVQ	DX, (R8)
	VZEROUPPER
	RET

	// we've found the chunk containing the byte
	// now just figure out which specific byte it is
ssesuccess:
	// get the index of the least significant set bit
	BSFW	DX, DX
	SUBQ	SI, DI
	ADDQ	DI, DX
	MOVQ	DX, (R8)
	RET

success:
	// SCASB leaves DI one past the match, hence the -1
	SUBQ	SI, DI
	SUBL	$1, DI
	MOVQ	DI, (R8)
	RET

// bytes.Equal(a, b []byte) bool — unequal lengths short-circuit to false;
// otherwise defer to memeqbody.
TEXT bytes·Equal(SB),NOSPLIT,$0-49
	MOVQ	a_len+8(FP), BX
	MOVQ	b_len+32(FP), CX
	CMPQ	BX, CX
	JNE	eqret
	MOVQ	a+0(FP), SI
	MOVQ	b+24(FP), DI
	LEAQ	ret+48(FP), AX
	JMP	runtime·memeqbody(SB)
eqret:
	MOVB	$0, ret+48(FP)
	RET

// fastrand1 returns a pseudo-random uint32 from the per-m generator
// state (m.fastrand): x = x<<1, conditionally xored with 0x88888eef.
// Not cryptographically secure.
TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), AX
	MOVL	m_fastrand(AX), DX
	ADDL	DX, DX
	MOVL	DX, BX
	XORL	$0x88888eef, DX
	CMOVLMI	BX, DX	// keep the un-xored value when the xored one is negative
	MOVL	DX, m_fastrand(AX)
	MOVL	DX, ret+0(FP)
	RET

// return0 puts 0 in AX; used where the caller expects a C-style return.
TEXT runtime·return0(SB), NOSPLIT, $0
	MOVL	$0, AX
	RET


// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
// Must obey the gcc calling convention.
TEXT _cgo_topofstack(SB),NOSPLIT,$0
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), AX
	MOVQ	m_curg(AX), AX
	MOVQ	(g_stack+stack_hi)(AX), AX
	RET

// The top-most function running on a goroutine
// returns to goexit+PCQuantum.
TEXT runtime·goexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP
	CALL	runtime·goexit1(SB)	// does not return
	// traceback from goexit1 must hit code range of goexit
	BYTE	$0x90	// NOP

// prefetcht0/t1/t2/nta(addr uintptr): issue the corresponding
// x86 prefetch hint for addr.
TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT0	(AX)
	RET

TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT1	(AX)
	RET

TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT2	(AX)
	RET

TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHNTA	(AX)
	RET

// This is called from .init_array and follows the platform, not Go, ABI.
// addmoduledata appends the moduledata in DI (first C argument) to the
// runtime's linked list of modules, updating lastmoduledatap.
TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
	MOVQ	runtime·lastmoduledatap(SB), AX
	MOVQ	DI, moduledata_next(AX)
	MOVQ	DI, runtime·lastmoduledatap(SB)
	POPQ	R15
	RET