// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "go_tls.h"
#include "funcdata.h"
#include "textflag.h"

// rt0_go is the Go runtime entry point: it sets up g0's stack bounds,
// probes CPU features, initializes TLS and the m0/g0 pairing, runs
// runtime initialization, queues runtime.main as the first goroutine,
// and starts the scheduler. It never returns.
TEXT runtime·rt0_go(SB),NOSPLIT,$0
	// copy arguments forward on an even stack
	MOVQ	DI, AX		// argc
	MOVQ	SI, BX		// argv
	SUBQ	$(4*8+7), SP		// 2args 2auto
	ANDQ	$~15, SP
	MOVQ	AX, 16(SP)
	MOVQ	BX, 24(SP)

	// create istack out of the given (operating system) stack.
	// _cgo_init may update stackguard.
	MOVQ	$runtime·g0(SB), DI
	LEAQ	(-64*1024+104)(SP), BX
	MOVQ	BX, g_stackguard0(DI)
	MOVQ	BX, g_stackguard1(DI)
	MOVQ	BX, (g_stack+stack_lo)(DI)
	MOVQ	SP, (g_stack+stack_hi)(DI)

	// find out information about the processor we're on
	MOVQ	$0, AX
	CPUID
	MOVQ	AX, SI		// SI = max supported CPUID leaf
	CMPQ	AX, $0
	JE	nocpuinfo

	// Figure out how to serialize RDTSC.
	// On Intel processors LFENCE is enough. AMD requires MFENCE.
	// Don't know about the rest, so let's do MFENCE.
	CMPL	BX, $0x756E6547  // "Genu"
	JNE	notintel
	CMPL	DX, $0x49656E69  // "ineI"
	JNE	notintel
	CMPL	CX, $0x6C65746E  // "ntel"
	JNE	notintel
	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
notintel:

	// Load EAX=1 cpuid flags
	MOVQ	$1, AX
	CPUID
	MOVL	CX, runtime·cpuid_ecx(SB)
	MOVL	DX, runtime·cpuid_edx(SB)

	// Load EAX=7/ECX=0 cpuid flags
	CMPQ	SI, $7
	JLT	no7
	MOVL	$7, AX
	MOVL	$0, CX
	CPUID
	MOVL	BX, runtime·cpuid_ebx7(SB)
no7:
	// Detect AVX and AVX2 as per 14.7.1 Detection of AVX2 chapter of [1]
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
	MOVL	runtime·cpuid_ecx(SB), CX
	ANDL	$0x18000000, CX		// check for OSXSAVE and AVX bits
	CMPL	CX, $0x18000000
	JNE	noavx
	MOVL	$0, CX
	// For XGETBV, OSXSAVE bit is required and sufficient
	XGETBV
	ANDL	$6, AX
	CMPL	AX, $6			// Check for OS support of YMM registers
	JNE	noavx
	MOVB	$1, runtime·support_avx(SB)
	TESTL	$(1<<5), runtime·cpuid_ebx7(SB)	// check for AVX2 bit
	JEQ	noavx2
	MOVB	$1, runtime·support_avx2(SB)
	JMP	nocpuinfo
noavx:
	MOVB	$0, runtime·support_avx(SB)
noavx2:
	MOVB	$0, runtime·support_avx2(SB)
nocpuinfo:

	// if there is an _cgo_init, call it.
	MOVQ	_cgo_init(SB), AX
	TESTQ	AX, AX
	JZ	needtls
	// g0 already in DI
	MOVQ	DI, CX	// Win64 uses CX for first parameter
	MOVQ	$setg_gcc<>(SB), SI
	CALL	AX

	// update stackguard after _cgo_init
	MOVQ	$runtime·g0(SB), CX
	MOVQ	(g_stack+stack_lo)(CX), AX
	ADDQ	$const__StackGuard, AX
	MOVQ	AX, g_stackguard0(CX)
	MOVQ	AX, g_stackguard1(CX)

#ifndef GOOS_windows
	JMP ok
#endif
needtls:
#ifdef GOOS_plan9
	// skip TLS setup on Plan 9
	JMP ok
#endif
#ifdef GOOS_solaris
	// skip TLS setup on Solaris
	JMP ok
#endif

	LEAQ	runtime·m0+m_tls(SB), DI
	CALL	runtime·settls(SB)

	// store through it, to make sure it works
	get_tls(BX)
	MOVQ	$0x123, g(BX)
	MOVQ	runtime·m0+m_tls(SB), AX
	CMPQ	AX, $0x123
	JEQ 2(PC)
	MOVL	AX, 0	// abort
ok:
	// set the per-goroutine and per-mach "registers"
	get_tls(BX)
	LEAQ	runtime·g0(SB), CX
	MOVQ	CX, g(BX)
	LEAQ	runtime·m0(SB), AX

	// save m->g0 = g0
	MOVQ	CX, m_g0(AX)
	// save m0 to g0->m
	MOVQ	AX, g_m(CX)

	CLD				// convention is D is always left cleared
	CALL	runtime·check(SB)

	MOVL	16(SP), AX		// copy argc
	MOVL	AX, 0(SP)
	MOVQ	24(SP), AX		// copy argv
	MOVQ	AX, 8(SP)
	CALL	runtime·args(SB)
	CALL	runtime·osinit(SB)
	CALL	runtime·schedinit(SB)

	// create a new goroutine to start program
	MOVQ	$runtime·mainPC(SB), AX		// entry
	PUSHQ	AX
	PUSHQ	$0			// arg size
	CALL	runtime·newproc(SB)
	POPQ	AX
	POPQ	AX

	// start this M
	CALL	runtime·mstart(SB)

	MOVL	$0xf1, 0xf1  // crash; mstart should never return

DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
GLOBL	runtime·mainPC(SB),RODATA,$8

TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
	BYTE	$0xcc	// INT3
	RET

TEXT runtime·asminit(SB),NOSPLIT,$0-0
	// No per-thread init.
	RET

/*
 *  go-routine
 */

// void gosave(Gobuf*)
// save state in Gobuf; setjmp
TEXT runtime·gosave(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), AX		// gobuf
	LEAQ	buf+0(FP), BX		// caller's SP
	MOVQ	BX, gobuf_sp(AX)
	MOVQ	0(SP), BX		// caller's PC
	MOVQ	BX, gobuf_pc(AX)
	MOVQ	$0, gobuf_ret(AX)
	MOVQ	$0, gobuf_ctxt(AX)
	MOVQ	BP, gobuf_bp(AX)
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	BX, gobuf_g(AX)
	RET

// void gogo(Gobuf*)
// restore state from Gobuf; longjmp
// Does not return to its caller: it jumps to the saved PC.
TEXT runtime·gogo(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), BX		// gobuf
	MOVQ	gobuf_g(BX), DX
	MOVQ	0(DX), CX		// make sure g != nil
	get_tls(CX)
	MOVQ	DX, g(CX)
	MOVQ	gobuf_sp(BX), SP	// restore SP
	MOVQ	gobuf_ret(BX), AX
	MOVQ	gobuf_ctxt(BX), DX
	MOVQ	gobuf_bp(BX), BP
	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
	MOVQ	$0, gobuf_ret(BX)
	MOVQ	$0, gobuf_ctxt(BX)
	MOVQ	$0, gobuf_bp(BX)
	MOVQ	gobuf_pc(BX), BX
	JMP	BX

// func mcall(fn func(*g))
// Switch to m->g0's stack, call fn(g).
// Fn must never return. It should gogo(&g->sched)
// to keep running g.
TEXT runtime·mcall(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI

	get_tls(CX)
	MOVQ	g(CX), AX	// save state in g->sched
	MOVQ	0(SP), BX	// caller's PC
	MOVQ	BX, (g_sched+gobuf_pc)(AX)
	LEAQ	fn+0(FP), BX	// caller's SP
	MOVQ	BX, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to m->g0 & its stack, call fn
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	SI, AX	// if g == m->g0 call badmcall
	JNE	3(PC)
	MOVQ	$runtime·badmcall(SB), AX
	JMP	AX
	MOVQ	SI, g(CX)	// g = m->g0
	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	PUSHQ	AX		// fn's argument: the g we came from
	MOVQ	DI, DX		// DX = ctxt register for the closure call
	MOVQ	0(DI), DI
	CALL	DI
	POPQ	AX
	// fn must not return; if it does, report the bug.
	MOVQ	$runtime·badmcall2(SB), AX
	JMP	AX
	RET

// systemstack_switch is a dummy routine that systemstack leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the system stack because the one at the top of
// the system stack terminates the stack walk (see topofstack()).
TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
	RET

// func systemstack(fn func())
TEXT runtime·systemstack(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI	// DI = fn
	get_tls(CX)
	MOVQ	g(CX), AX	// AX = g
	MOVQ	g_m(AX), BX	// BX = m

	MOVQ	m_gsignal(BX), DX	// DX = gsignal
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_g0(BX), DX	// DX = g0
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_curg(BX), R8
	CMPQ	AX, R8
	JEQ	switch

	// Bad: g is not gsignal, not g0, not curg. What is it?
	MOVQ	$runtime·badsystemstack(SB), AX
	CALL	AX

switch:
	// save our state in g->sched. Pretend to
	// be systemstack_switch if the G stack is scanned.
	MOVQ	$runtime·systemstack_switch(SB), SI
	MOVQ	SI, (g_sched+gobuf_pc)(AX)
	MOVQ	SP, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to g0
	MOVQ	DX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(DX), BX
	// make it look like mstart called systemstack on g0, to stop traceback
	SUBQ	$8, BX
	MOVQ	$runtime·mstart(SB), DX
	MOVQ	DX, 0(BX)
	MOVQ	BX, SP

	// call target function
	MOVQ	DI, DX		// DX = ctxt register for the closure call
	MOVQ	0(DI), DI
	CALL	DI

	// switch back to g
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), BX
	MOVQ	m_curg(BX), AX
	MOVQ	AX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(AX), SP
	MOVQ	$0, (g_sched+gobuf_sp)(AX)
	RET

noswitch:
	// already on m stack, just call directly
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI
	RET

/*
 * support for morestack
 */

// Called during function prolog when more stack is needed.
//
// The traceback routines see morestack on a g0 as being
// the top of a stack (for example, morestack calling newstack
// calling the scheduler calling newm calling gc), so we must
// record an argument size. For that purpose, it has no arguments.
TEXT runtime·morestack(SB),NOSPLIT,$0-0
	// Cannot grow scheduler stack (m->g0).
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	g(CX), SI
	JNE	2(PC)
	INT	$3

	// Cannot grow signal stack (m->gsignal).
	MOVQ	m_gsignal(BX), SI
	CMPQ	g(CX), SI
	JNE	2(PC)
	INT	$3

	// Called from f.
	// Set m->morebuf to f's caller.
	MOVQ	8(SP), AX	// f's caller's PC
	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
	LEAQ	16(SP), AX	// f's caller's SP
	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SI, (m_morebuf+gobuf_g)(BX)

	// Set g->sched to context in f.
	MOVQ	0(SP), AX // f's PC
	MOVQ	AX, (g_sched+gobuf_pc)(SI)
	MOVQ	SI, (g_sched+gobuf_g)(SI)
	LEAQ	8(SP), AX // f's SP
	MOVQ	AX, (g_sched+gobuf_sp)(SI)
	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
	MOVQ	BP, (g_sched+gobuf_bp)(SI)

	// Call newstack on m->g0's stack.
	MOVQ	m_g0(BX), BX
	MOVQ	BX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(BX), SP
	CALL	runtime·newstack(SB)
	MOVQ	$0, 0x1003	// crash if newstack returns
	RET

// morestack but not preserving ctxt.
TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
	MOVL	$0, DX
	JMP	runtime·morestack(SB)

TEXT runtime·stackBarrier(SB),NOSPLIT,$0
	// We came here via a RET to an overwritten return PC.
	// AX may be live. Other registers are available.

	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
	get_tls(CX)
	MOVQ	g(CX), CX
	MOVQ	(g_stkbar+slice_array)(CX), DX
	MOVQ	g_stkbarPos(CX), BX
	IMULQ	$stkbar__size, BX	// Too big for SIB.
	MOVQ	stkbar_savedLRPtr(DX)(BX*1), R8
	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
	// Assert that we're popping the right saved LR.
	ADDQ	$8, R8
	CMPQ	R8, SP
	JEQ	2(PC)
	MOVL	$0, 0
	// Record that this stack barrier was hit.
	ADDQ	$1, g_stkbarPos(CX)
	// Jump to the original return PC.
	JMP	BX

// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
// of constant-sized-frame functions to encode a few bits of size in the pc.
// Caution: ugly multiline assembly macros in your future!

#define DISPATCH(NAME,MAXSIZE)		\
	CMPQ	CX, $MAXSIZE;		\
	JA	3(PC);			\
	MOVQ	$NAME(SB), AX;		\
	JMP	AX
// Note: can't just "JMP NAME(SB)" - bad inlining results.
TEXT reflect·call(SB), NOSPLIT, $0-0
	JMP	·reflectcall(SB)

TEXT ·reflectcall(SB), NOSPLIT, $0-32
	MOVLQZX argsize+24(FP), CX
	// NOTE(rsc): No call16, because CALLFN needs four words
	// of argument space to invoke callwritebarrier.
	DISPATCH(runtime·call32, 32)
	DISPATCH(runtime·call64, 64)
	DISPATCH(runtime·call128, 128)
	DISPATCH(runtime·call256, 256)
	DISPATCH(runtime·call512, 512)
	DISPATCH(runtime·call1024, 1024)
	DISPATCH(runtime·call2048, 2048)
	DISPATCH(runtime·call4096, 4096)
	DISPATCH(runtime·call8192, 8192)
	DISPATCH(runtime·call16384, 16384)
	DISPATCH(runtime·call32768, 32768)
	DISPATCH(runtime·call65536, 65536)
	DISPATCH(runtime·call131072, 131072)
	DISPATCH(runtime·call262144, 262144)
	DISPATCH(runtime·call524288, 524288)
	DISPATCH(runtime·call1048576, 1048576)
	DISPATCH(runtime·call2097152, 2097152)
	DISPATCH(runtime·call4194304, 4194304)
	DISPATCH(runtime·call8388608, 8388608)
	DISPATCH(runtime·call16777216, 16777216)
	DISPATCH(runtime·call33554432, 33554432)
	DISPATCH(runtime·call67108864, 67108864)
	DISPATCH(runtime·call134217728, 134217728)
	DISPATCH(runtime·call268435456, 268435456)
	DISPATCH(runtime·call536870912, 536870912)
	DISPATCH(runtime·call1073741824, 1073741824)
	MOVQ	$runtime·badreflectcall(SB), AX
	JMP	AX

#define CALLFN(NAME,MAXSIZE)			\
TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
	NO_LOCAL_POINTERS;			\
	/* copy arguments to stack */		\
	MOVQ	argptr+16(FP), SI;		\
	MOVLQZX argsize+24(FP), CX;		\
	MOVQ	SP, DI;				\
	REP;MOVSB;				\
	/* call function */			\
	MOVQ	f+8(FP), DX;			\
	PCDATA  $PCDATA_StackMapIndex, $0;	\
	CALL	(DX);				\
	/* copy return values back */		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX	argsize+24(FP), CX;		\
	MOVLQZX	retoffset+28(FP), BX;		\
	MOVQ	SP, SI;				\
	ADDQ	BX, DI;				\
	ADDQ	BX, SI;				\
	SUBQ	BX, CX;				\
	REP;MOVSB;				\
	/* execute write barrier updates */	\
	MOVQ	argtype+0(FP), DX;		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX	argsize+24(FP), CX;		\
	MOVLQZX retoffset+28(FP), BX;		\
	MOVQ	DX, 0(SP);			\
	MOVQ	DI, 8(SP);			\
	MOVQ	CX, 16(SP);			\
	MOVQ	BX, 24(SP);			\
	CALL	runtime·callwritebarrier(SB);	\
	RET

CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
CALLFN(·call256, 256)
CALLFN(·call512, 512)
CALLFN(·call1024, 1024)
CALLFN(·call2048, 2048)
CALLFN(·call4096, 4096)
CALLFN(·call8192, 8192)
CALLFN(·call16384, 16384)
CALLFN(·call32768, 32768)
CALLFN(·call65536, 65536)
CALLFN(·call131072, 131072)
CALLFN(·call262144, 262144)
CALLFN(·call524288, 524288)
CALLFN(·call1048576, 1048576)
CALLFN(·call2097152, 2097152)
CALLFN(·call4194304, 4194304)
CALLFN(·call8388608, 8388608)
CALLFN(·call16777216, 16777216)
CALLFN(·call33554432, 33554432)
CALLFN(·call67108864, 67108864)
CALLFN(·call134217728, 134217728)
CALLFN(·call268435456, 268435456)
CALLFN(·call536870912, 536870912)
CALLFN(·call1073741824, 1073741824)

// procyield(cycles uint32): spin for the given count using PAUSE,
// a CPU hint that we are in a spin-wait loop.
TEXT runtime·procyield(SB),NOSPLIT,$0-0
	MOVL	cycles+0(FP), AX
again:
	PAUSE
	SUBL	$1, AX
	JNZ	again
	RET

TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
	// Stores are already ordered on x86, so this is just a
	// compile barrier.
	RET

// void jmpdefer(fn, sp);
// called from deferreturn.
// 1. pop the caller
// 2. sub 5 bytes from the callers return
// 3. jmp to the argument
TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
	MOVQ	fv+0(FP), DX	// fn
	MOVQ	argp+8(FP), BX	// caller sp
	LEAQ	-8(BX), SP	// caller sp after CALL
	SUBQ	$5, (SP)	// return to CALL again (5 = size of the CALL instruction)
	MOVQ	0(DX), BX
	JMP	BX	// but first run the deferred function

// Save state of caller into g->sched. Smashes R8, R9.
TEXT gosave<>(SB),NOSPLIT,$0
	get_tls(R8)
	MOVQ	g(R8), R8
	MOVQ	0(SP), R9
	MOVQ	R9, (g_sched+gobuf_pc)(R8)
	LEAQ	8(SP), R9
	MOVQ	R9, (g_sched+gobuf_sp)(R8)
	MOVQ	$0, (g_sched+gobuf_ret)(R8)
	MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
	MOVQ	BP, (g_sched+gobuf_bp)(R8)
	RET

// func asmcgocall(fn, arg unsafe.Pointer) int32
// Call fn(arg) on the scheduler stack,
// aligned appropriately for the gcc ABI.
// See cgocall.go for more details.
TEXT ·asmcgocall(SB),NOSPLIT,$0-20
	MOVQ	fn+0(FP), AX
	MOVQ	arg+8(FP), BX

	MOVQ	SP, DX

	// Figure out if we need to switch to m->g0 stack.
	// We get called to create new OS threads too, and those
	// come in on the m->g0 stack already.
	get_tls(CX)
	MOVQ	g(CX), R8
	CMPQ	R8, $0
	JEQ	nosave
	MOVQ	g_m(R8), R8
	MOVQ	m_g0(R8), SI
	MOVQ	g(CX), DI
	CMPQ	SI, DI
	JEQ	nosave
	MOVQ	m_gsignal(R8), SI
	CMPQ	SI, DI
	JEQ	nosave

	// Switch to system stack.
	MOVQ	m_g0(R8), SI
	CALL	gosave<>(SB)
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP

	// Now on a scheduling stack (a pthread-created stack).
	// Make sure we have enough room for 4 stack-backed fast-call
	// registers as per windows amd64 calling convention.
	SUBQ	$64, SP
	ANDQ	$~15, SP	// alignment for gcc ABI
	MOVQ	DI, 48(SP)	// save g
	MOVQ	(g_stack+stack_hi)(DI), DI
	SUBQ	DX, DI
	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
	MOVQ	BX, CX		// CX = first argument in Win64
	CALL	AX

	// Restore registers, g, stack pointer.
	get_tls(CX)
	MOVQ	48(SP), DI
	MOVQ	(g_stack+stack_hi)(DI), SI
	SUBQ	40(SP), SI
	MOVQ	DI, g(CX)
	MOVQ	SI, SP

	MOVL	AX, ret+16(FP)
	RET

nosave:
	// Running on a system stack, perhaps even without a g.
	// Having no g can happen during thread creation or thread teardown
	// (see needm/dropm on Solaris, for example).
	// This code is like the above sequence but without saving/restoring g
	// and without worrying about the stack moving out from under us
	// (because we're on a system stack, not a goroutine stack).
	// The above code could be used directly if already on a system stack,
	// but then the only path through this code would be a rare case on Solaris.
	// Using this code for all "already on system stack" calls exercises it more,
	// which should help keep it correct.
	SUBQ	$64, SP
	ANDQ	$~15, SP
	MOVQ	$0, 48(SP)	// where above code stores g, in case someone looks during debugging
	MOVQ	DX, 40(SP)	// save original stack pointer
	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
	MOVQ	BX, CX		// CX = first argument in Win64
	CALL	AX
	MOVQ	40(SP), SI	// restore original stack pointer
	MOVQ	SI, SP
	MOVL	AX, ret+16(FP)
	RET

// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
// Turn the fn into a Go func (by taking its address) and call
// cgocallback_gofunc.
TEXT runtime·cgocallback(SB),NOSPLIT,$24-24
	LEAQ	fn+0(FP), AX
	MOVQ	AX, 0(SP)
	MOVQ	frame+8(FP), AX
	MOVQ	AX, 8(SP)
	MOVQ	framesize+16(FP), AX
	MOVQ	AX, 16(SP)
	MOVQ	$runtime·cgocallback_gofunc(SB), AX
	CALL	AX
	RET

// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
// See cgocall.go for more details.
TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-24
	NO_LOCAL_POINTERS

	// If g is nil, Go did not create the current thread.
	// Call needm to obtain one m for temporary use.
	// In this case, we're running on the thread stack, so there's
	// lots of space, but the linker doesn't know. Hide the call from
	// the linker analysis by using an indirect call through AX.
	get_tls(CX)
#ifdef GOOS_windows
	MOVL	$0, BX
	CMPQ	CX, $0
	JEQ	2(PC)
#endif
	MOVQ	g(CX), BX
	CMPQ	BX, $0
	JEQ	needm
	MOVQ	g_m(BX), BX
	MOVQ	BX, R8 // holds oldm until end of function
	JMP	havem
needm:
	MOVQ	$0, 0(SP)
	MOVQ	$runtime·needm(SB), AX
	CALL	AX
	MOVQ	0(SP), R8
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX

	// Set m->sched.sp = SP, so that if a panic happens
	// during the function we are about to execute, it will
	// have a valid SP to run on the g0 stack.
	// The next few lines (after the havem label)
	// will save this SP onto the stack and then write
	// the same SP back to m->sched.sp. That seems redundant,
	// but if an unrecovered panic happens, unwindm will
	// restore the g->sched.sp from the stack location
	// and then systemstack will try to use it. If we don't set it here,
	// that restored SP will be uninitialized (typically 0) and
	// will not be usable.
	MOVQ	m_g0(BX), SI
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

havem:
	// Now there's a valid m, and we're running on its m->g0.
	// Save current m->g0->sched.sp on stack and then set it to SP.
	// Save current sp in m->g0->sched.sp in preparation for
	// switch back to m->curg stack.
	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
	MOVQ	m_g0(BX), SI
	MOVQ	(g_sched+gobuf_sp)(SI), AX
	MOVQ	AX, 0(SP)
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

	// Switch to m->curg stack and call runtime.cgocallbackg.
	// Because we are taking over the execution of m->curg
	// but *not* resuming what had been running, we need to
	// save that information (m->curg->sched) so we can restore it.
	// We can restore m->curg->sched.sp easily, because calling
	// runtime.cgocallbackg leaves SP unchanged upon return.
	// To save m->curg->sched.pc, we push it onto the stack.
	// This has the added benefit that it looks to the traceback
	// routine like cgocallbackg is going to return to that
	// PC (because the frame we allocate below has the same
	// size as cgocallback_gofunc's frame declared above)
	// so that the traceback will seamlessly trace back into
	// the earlier calls.
	//
	// In the new goroutine, 0(SP) holds the saved R8.
	MOVQ	m_curg(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
	MOVQ	(g_sched+gobuf_pc)(SI), BX
	MOVQ	BX, -8(DI)
	// Compute the size of the frame, including return PC and, if
	// GOEXPERIMENT=framepointer, the saved based pointer
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX
	SUBQ	AX, DI
	MOVQ	DI, SP

	MOVQ	R8, 0(SP)
	CALL	runtime·cgocallbackg(SB)
	MOVQ	0(SP), R8

	// Compute the size of the frame again. FP and SP have
	// completely different values here than they did above,
	// but only their difference matters.
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX

	// Restore g->sched (== m->curg->sched) from saved values.
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SP, DI
	ADDQ	AX, DI
	MOVQ	-8(DI), BX
	MOVQ	BX, (g_sched+gobuf_pc)(SI)
	MOVQ	DI, (g_sched+gobuf_sp)(SI)

	// Switch back to m->g0's stack and restore m->g0->sched.sp.
	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
	// so we do not have to restore it.)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP
	MOVQ	0(SP), AX
	MOVQ	AX, (g_sched+gobuf_sp)(SI)

	// If the m on entry was nil, we called needm above to borrow an m
	// for the duration of the call. Since the call is over, return it with dropm.
	CMPQ	R8, $0
	JNE 3(PC)
	MOVQ	$runtime·dropm(SB), AX
	CALL	AX

	// Done!
	RET

// void setg(G*); set g. for use by needm.
TEXT runtime·setg(SB), NOSPLIT, $0-8
	MOVQ	gg+0(FP), BX
#ifdef GOOS_windows
	// On Windows the g pointer lives at 0x28(GS); keep it in sync.
	CMPQ	BX, $0
	JNE	settls
	MOVQ	$0, 0x28(GS)
	RET
settls:
	MOVQ	g_m(BX), AX
	LEAQ	m_tls(AX), AX
	MOVQ	AX, 0x28(GS)
#endif
	get_tls(CX)
	MOVQ	BX, g(CX)
	RET

// void setg_gcc(G*); set g called from gcc.
TEXT setg_gcc<>(SB),NOSPLIT,$0
	get_tls(AX)
	MOVQ	DI, g(AX)	// DI = first argument in the C (System V) ABI
	RET

// check that SP is in range [g->stack.lo, g->stack.hi)
TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
	get_tls(CX)
	MOVQ	g(CX), AX
	CMPQ	(g_stack+stack_hi)(AX), SP
	JHI	2(PC)
	INT	$3
	CMPQ	SP, (g_stack+stack_lo)(AX)
	JHI	2(PC)
	INT	$3
	RET

TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
	MOVQ	argp+0(FP),AX		// addr of first arg
	MOVQ	-8(AX),AX		// get calling pc
	CMPQ	AX, runtime·stackBarrierPC(SB)
	JNE	nobar
	// Get original return PC.
	CALL	runtime·nextBarrierPC(SB)
	MOVQ	0(SP), AX
nobar:
	MOVQ	AX, ret+8(FP)
	RET

TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
	MOVQ	argp+0(FP),AX		// addr of first arg
	MOVQ	pc+8(FP), BX
	MOVQ	-8(AX), CX
	CMPQ	CX, runtime·stackBarrierPC(SB)
	JEQ	setbar
	MOVQ	BX, -8(AX)		// set calling pc
	RET
setbar:
	// Set the stack barrier return PC.
	MOVQ	BX, 0(SP)
	CALL	runtime·setNextBarrierPC(SB)
	RET

TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
	MOVQ	argp+0(FP), AX
	MOVQ	AX, ret+8(FP)
	RET

// func cputicks() int64
// Reads the time-stamp counter, serializing first with LFENCE (Intel)
// or MFENCE (others) as decided at startup in rt0_go.
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
	JNE	mfence
	LFENCE
	JMP	done
mfence:
	MFENCE
done:
	RDTSC
	SHLQ	$32, DX	// RDTSC returns high 32 bits in DX, low in AX
	ADDQ	DX, AX
	MOVQ	AX, ret+0(FP)
	RET

// memhash_varlen(p unsafe.Pointer, h seed) uintptr
// redirects to memhash(p, h, size) using the size
// stored in the closure.
TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
	GO_ARGS
	NO_LOCAL_POINTERS
	MOVQ	p+0(FP), AX
	MOVQ	h+8(FP), BX
	MOVQ	8(DX), CX	// size is stored in the closure (DX = ctxt)
	MOVQ	AX, 0(SP)
	MOVQ	BX, 8(SP)
	MOVQ	CX, 16(SP)
	CALL	runtime·memhash(SB)
	MOVQ	24(SP), AX
	MOVQ	AX, ret+16(FP)
	RET

// hash function using AES hardware instructions
TEXT runtime·aeshash(SB),NOSPLIT,$0-32
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	s+16(FP), CX	// size
	LEAQ	ret+24(FP), DX
	JMP	runtime·aeshashbody(SB)

TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to string struct
	MOVQ	8(AX), CX	// length of string
	MOVQ	(AX), AX	// string data
	LEAQ	ret+16(FP), DX
	JMP	runtime·aeshashbody(SB)

// AX: data
// CX: length
// DX: address to put return value
TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
	// Fill an SSE register with our seeds.
	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
	PINSRW	$4, CX, X0			// 16 bits of length
	PSHUFHW $0, X0, X0			// repeat length 4 times total
	MOVO	X0, X1				// save unscrambled seed
	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
	AESENC	X0, X0				// scramble seed

	// Dispatch on input length.
	CMPQ	CX, $16
	JB	aes0to15
	JE	aes16
	CMPQ	CX, $32
	JBE	aes17to32
	CMPQ	CX, $64
	JBE	aes33to64
	CMPQ	CX, $128
	JBE	aes65to128
	JMP	aes129plus

aes0to15:
	TESTQ	CX, CX
	JE	aes0

	ADDQ	$16, AX
	TESTW	$0xff0, AX
	JE	endofpage

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X1
	ADDQ	CX, CX
	MOVQ	$masks<>(SB), AX
	PAND	(AX)(CX*8), X1	// mask off the bytes beyond the input length
final1:
	AESENC	X0, X1	// scramble input, xor in seed
	AESENC	X1, X1  // scramble combo 2 times
	AESENC	X1, X1
	MOVQ	X1, (DX)
	RET

endofpage:
	// address ends in 1111xxxx. Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	MOVOU	-32(AX)(CX*1), X1
	ADDQ	CX, CX
	MOVQ	$shifts<>(SB), AX
	PSHUFB	(AX)(CX*8), X1
	JMP	final1

aes0:
	// Return scrambled input seed
	AESENC	X0, X0
	MOVQ	X0, (DX)
	RET

aes16:
	MOVOU	(AX), X1
	JMP	final1

aes17to32:
	// make second starting seed
	PXOR	runtime·aeskeysched+16(SB), X1
	AESENC	X1, X1

	// load data to be hashed
	MOVOU	(AX), X2
	MOVOU	-16(AX)(CX*1), X3	// may overlap X2's bytes; still covers all input

	// scramble 3 times
	AESENC	X0, X2
	AESENC	X1, X3
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3

	// combine results
	PXOR	X3, X2
	MOVQ	X2, (DX)
	RET

aes33to64:
	// make 3 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3

	MOVOU	(AX), X4
	MOVOU	16(AX), X5
	MOVOU	-32(AX)(CX*1), X6
	MOVOU	-16(AX)(CX*1), X7

	AESENC	X0, X4
	AESENC	X1, X5
	AESENC	X2, X6
	AESENC	X3, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	PXOR	X6, X4
	PXOR	X7, X5
	PXOR	X5, X4
	MOVQ	X4, (DX)
	RET

aes65to128:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// load data
	MOVOU	(AX), X8
	MOVOU	16(AX), X9
	MOVOU	32(AX), X10
	MOVOU	48(AX), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// scramble data, xor in seed
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	// scramble twice
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

aes129plus:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// start with last (possibly overlapping) block
	MOVOU	-128(AX)(CX*1), X8
	MOVOU	-112(AX)(CX*1), X9
	MOVOU	-96(AX)(CX*1), X10
	MOVOU	-80(AX)(CX*1), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// scramble input once, xor in seed
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	// compute number of remaining 128-byte blocks
	DECQ	CX
	SHRQ	$7, CX

aesloop:
	// scramble state, xor in a block
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	MOVOU	64(AX), X4
	MOVOU	80(AX), X5
	MOVOU	96(AX), X6
	MOVOU	112(AX), X7
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	// scramble state
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	ADDQ	$128, AX
	DECQ	CX
	JNE	aesloop

	// 2 more scrambles to finish
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

// simple mask to get rid of data in the high part of the register.
// masks<>[i] (i = 0..16, 16 bytes each) keeps the low i bytes of a 128-bit
// register and zeroes the rest; used to mask off garbage in partial loads.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256

// checkASM reports (in ret+0) whether the asm data tables meet their
// alignment requirements; called from Go at startup.
TEXT ·checkASM(SB),NOSPLIT,$0-1
	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
	MOVQ	$masks<>(SB), AX
	MOVQ	$shifts<>(SB), BX
	ORQ	BX, AX
	TESTQ	$15, AX
	SETEQ	ret+0(FP)
	RET

// these are arguments to pshufb. They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
DATA shifts<>+0x00(SB)/8, $0x0000000000000000
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256

// memequal(p, q unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal(SB),NOSPLIT,$0-25
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq	// identical pointers: trivially equal
	MOVQ	size+16(FP), BX
	LEAQ	ret+24(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+24(FP)
	RET

// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	8(DX), BX	// compiler stores size at offset 8 in the closure
	LEAQ	ret+16(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+16(FP)
	RET

// eqstring tests whether two strings are equal.
// The compiler guarantees that strings passed
// to eqstring have equal length.
// See runtime_test.go:eqstring_generic for
// equivalent Go code.
TEXT runtime·eqstring(SB),NOSPLIT,$0-33
	MOVQ	s1str+0(FP), SI
	MOVQ	s2str+16(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	s1len+8(FP), BX
	LEAQ	v+32(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, v+32(FP)
	RET

// memeqbody: shared byte-equality kernel.
// a in SI
// b in DI
// count in BX
// address of result byte in AX
TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small
	CMPQ	BX, $64
	JB	bigloop
	CMPB	runtime·support_avx2(SB), $1
	JE	hugeloop_avx2

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff	// all 16 mask bits set <=> all 64 bytes equal
	JEQ	hugeloop
	MOVB	$0, (AX)
	RET

	// 64 bytes at a time using ymm registers
hugeloop_avx2:
	CMPQ	BX, $64
	JB	bigloop_avx2
	VMOVDQU	(SI), Y0
	VMOVDQU	(DI), Y1
	VMOVDQU	32(SI), Y2
	VMOVDQU	32(DI), Y3
	VPCMPEQB	Y1, Y0, Y4
	VPCMPEQB	Y2, Y3, Y5
	VPAND	Y4, Y5, Y6
	VPMOVMSKB	Y6, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffffffff
	JEQ	hugeloop_avx2
	VZEROUPPER	// leave AVX state before returning to SSE/scalar code
	MOVB	$0, (AX)
	RET

bigloop_avx2:
	VZEROUPPER

	// 8 bytes at a time using 64-bit register
bigloop:
	CMPQ	BX, $8
	JBE	leftover
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	bigloop
	MOVB	$0, (AX)
	RET

	// remaining 0-8 bytes: compare the last 8 bytes (may overlap bytes
	// already checked, which is harmless).
leftover:
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	(AX)
	RET

small:
	CMPQ	BX, $0
	JEQ	equal

	LEAQ	0(BX*8), CX	// CX = bits we care about
	NEGQ	CX	// CX mod 64 = 64 - bits we care about

	CMPB	SI, $0xf8
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
di_finish:

	// XOR-style compare: difference is zero iff the low BX bytes match.
	SUBQ	SI, DI
	SHLQ	CX, DI	// shift out the bytes beyond BX
equal:
	SETEQ	(AX)
	RET

// cmpstring(s1, s2 string) int: three-way compare, result (-1/0/+1) is
// stored by cmpbody through R9.
TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
	MOVQ	s1_base+0(FP), SI
	MOVQ	s1_len+8(FP), BX
	MOVQ	s2_base+16(FP), DI
	MOVQ	s2_len+24(FP), DX
	LEAQ	ret+32(FP), R9
	JMP	runtime·cmpbody(SB)

// bytes.Compare(a, b []byte) int, sharing cmpbody with cmpstring.
TEXT bytes·Compare(SB),NOSPLIT,$0-56
	MOVQ	s1+0(FP), SI
	MOVQ	s1+8(FP), BX
	MOVQ	s2+24(FP), DI
	MOVQ	s2+32(FP), DX
	LEAQ	res+48(FP), R9
	JMP	runtime·cmpbody(SB)

// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame	// same pointer: ordering decided by lengths alone
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8	// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	CMPB	runtime·support_avx2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// diff64/diff48/diff32 rebase SI/DI onto the differing 16-byte chunk
	// found by big_loop, then fall into diff16.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX	// - bits left (== 64 - bits left mod 64)
	JEQ	allsame

	// load bytes of a into high bytes of AX
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI

	// load bytes of b in to high bytes of BX
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// common prefix exhausted: order by length.
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET

	// this works for >= 64 bytes of data.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB	Y2, Y3, Y0
	VPMOVMSKB	Y0, AX
	XORL	$0xffffffff, AX
	JNE	diff32_avx2
	VPCMPEQB	Y4, Y5, Y6
	VPMOVMSKB	Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP	diff16

	// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP	diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP	loop


// indexShortStr(s, c string) int: index of the first occurrence of c in s,
// or -1. Specialized brute-force search dispatched on len(c).
// TODO: Also use this in bytes.Index
TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
	MOVQ	s+0(FP), DI
	MOVQ	s_len+8(FP), CX
	MOVQ	c+16(FP), AX
	MOVQ	c_len+24(FP), BX
	CMPQ	BX, CX
	JA	fail	// needle longer than haystack
	CMPQ	BX, $2
	JA	_3_or_more
	MOVW	(AX), AX
	LEAQ	-1(DI)(CX*1), CX
loop2:
	MOVW	(DI), SI
	CMPW	SI,AX
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop2
	JMP	fail
_3_or_more:
	CMPQ	BX, $3
	JA	_4_or_more
	MOVW	1(AX), DX
	MOVW	(AX), AX
	LEAQ	-2(DI)(CX*1), CX
loop3:
	MOVW	(DI), SI
	CMPW	SI,AX
	JZ	partial_success3
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop3
	JMP	fail
partial_success3:
	MOVW	1(DI), SI
	CMPW	SI,DX
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop3
	JMP	fail
_4_or_more:
	CMPQ	BX, $4
	JA	_5_or_more
	MOVL	(AX), AX
	LEAQ	-3(DI)(CX*1), CX
loop4:
	MOVL	(DI), SI
	CMPL	SI,AX
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop4
	JMP	fail
_5_or_more:
	CMPQ	BX, $7
	JA	_8_or_more
	LEAQ	1(DI)(CX*1), CX
	SUBQ	BX, CX
	MOVL	-4(AX)(BX*1), DX	// last 4 bytes of needle (overlaps first 4)
	MOVL	(AX), AX
loop5to7:
	MOVL	(DI), SI
	CMPL	SI,AX
	JZ	partial_success5to7
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop5to7
	JMP	fail
partial_success5to7:
	MOVL	-4(BX)(DI*1), SI
	CMPL	SI,DX
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop5to7
	JMP	fail
_8_or_more:
	CMPQ	BX, $8
	JA	_9_or_more
	MOVQ	(AX), AX
	LEAQ	-7(DI)(CX*1), CX
loop8:
	MOVQ	(DI), SI
	CMPQ	SI,AX
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop8
	JMP	fail
_9_or_more:
	// Fix: was CMPQ BX, $16, which made _16_or_more's loop16 unreachable
	// (it is only entered when BX > 16, whose check then always jumps to
	// _17_to_31), so 16-byte needles took the slower two-load path below.
	CMPQ	BX, $15
	JA	_16_or_more
	LEAQ	1(DI)(CX*1), CX
	SUBQ	BX, CX
	MOVQ	-8(AX)(BX*1), DX	// last 8 bytes of needle (overlaps first 8)
	MOVQ	(AX), AX
loop9to15:
	MOVQ	(DI), SI
	CMPQ	SI,AX
	JZ	partial_success9to15
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop9to15
	JMP	fail
partial_success9to15:
	MOVQ	-8(BX)(DI*1), SI
	CMPQ	SI,DX
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop9to15
	JMP	fail
_16_or_more:
	CMPQ	BX, $16
	JA	_17_to_31
	MOVOU	(AX), X1
	LEAQ	-15(DI)(CX*1), CX
loop16:
	MOVOU	(DI), X2
	PCMPEQB	X1, X2
	PMOVMSKB X2, SI
	CMPQ	SI, $0xffff	// all 16 bytes equal
	JE	success
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop16
	JMP	fail
_17_to_31:
	LEAQ	1(DI)(CX*1), CX
	SUBQ	BX, CX
	MOVOU	-16(AX)(BX*1), X0	// last 16 bytes of needle
	MOVOU	(AX), X1	// first 16 bytes of needle
loop17to31:
	MOVOU	(DI), X2
	PCMPEQB	X1,X2
	PMOVMSKB X2, SI
	CMPQ	SI, $0xffff
	JE	partial_success17to31
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop17to31
	JMP	fail
partial_success17to31:
	MOVOU	-16(BX)(DI*1), X3
	PCMPEQB	X0, X3
	PMOVMSKB X3, SI
	CMPQ	SI, $0xffff
	JE	success
	ADDQ	$1,DI
	CMPQ	DI,CX
	JB	loop17to31
fail:
	MOVQ	$-1, ret+32(FP)
	RET
success:
	SUBQ	s+0(FP), DI	// convert pointer to index
	MOVQ	DI, ret+32(FP)
	RET


TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
	MOVQ	s+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+24(FP), AL
	LEAQ	ret+32(FP), R8
	JMP	runtime·indexbytebody(SB)

TEXT strings·IndexByte(SB),NOSPLIT,$0-32
	MOVQ	s+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+16(FP), AL
	LEAQ	ret+24(FP), R8
	JMP	runtime·indexbytebody(SB)

// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL	$0, X0, X0

	CMPQ	BX, $16
	JLT	small

	MOVQ	SI, DI

	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Find first set bit, if any.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ	$-1, (R8)
	RET

	// We've found a chunk containing the byte.
	// The chunk was loaded from DI.
	// The index of the matching byte in the chunk is DX.
	// The start of the data is SI.
ssesuccess:
	SUBQ	SI, DI	// Compute offset of chunk within data.
	ADDQ	DX, DI	// Add offset of byte within chunk.
	MOVQ	DI, (R8)
	RET

	// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure

	// Check if we'll load across a page boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1	// Load data
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	CMPL	DX, BX
	JAE	failure	// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX	// Shift desired bits down to bottom of register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	CMPB	runtime·support_avx2(SB), $1
	JNE	sse
	MOVD	AX, X0
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1	// broadcast sought byte to all 32 lanes
avx2_loop:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3
	JNZ	avx2success
	ADDQ	$32, DI
	CMPQ	DI, R11
	JLT	avx2_loop
	// Search the final (possibly overlapping) 32-byte chunk.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3
	JNZ	avx2success
	VZEROUPPER
	MOVQ	$-1, (R8)
	RET

avx2success:
	VPMOVMSKB	Y3, DX
	BSFL	DX, DX	// index of match within the chunk
	SUBQ	SI, DI
	ADDQ	DI, DX
	MOVQ	DX, (R8)
	VZEROUPPER
	RET

// bytes.Equal(a, b []byte) bool: lengths must match, then defer to memeqbody.
TEXT bytes·Equal(SB),NOSPLIT,$0-49
	MOVQ	a_len+8(FP), BX
	MOVQ	b_len+32(FP), CX
	CMPQ	BX, CX
	JNE	eqret
	MOVQ	a+0(FP), SI
	MOVQ	b+24(FP), DI
	LEAQ	ret+48(FP), AX
	JMP	runtime·memeqbody(SB)
eqret:
	MOVB	$0, ret+48(FP)
	RET

// fastrand1 returns a pseudo-random uint32, advancing the per-M state
// m.fastrand with a shift/xor step (not cryptographically secure).
TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), AX
	MOVL	m_fastrand(AX), DX
	ADDL	DX, DX	// shift left one bit
	MOVL	DX, BX
	XORL	$0x88888eef, DX
	CMOVLMI	BX, DX	// keep the un-xored value when the xor result is negative
	MOVL	DX, m_fastrand(AX)
	MOVL	DX, ret+0(FP)
	RET

// return0 puts 0 in AX; used as a dummy return value from assembly callers.
TEXT runtime·return0(SB), NOSPLIT, $0
	MOVL	$0, AX
	RET


// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
// Must obey the gcc calling convention.
TEXT _cgo_topofstack(SB),NOSPLIT,$0
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), AX
	MOVQ	m_curg(AX), AX
	MOVQ	(g_stack+stack_hi)(AX), AX
	RET

// The top-most function running on a goroutine
// returns to goexit+PCQuantum.
TEXT runtime·goexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP
	CALL	runtime·goexit1(SB)	// does not return
	// traceback from goexit1 must hit code range of goexit
	BYTE	$0x90	// NOP

// prefetcht0/t1/t2/nta issue the corresponding x86 prefetch hint for addr.
TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT0	(AX)
	RET

TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT1	(AX)
	RET

TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT2	(AX)
	RET

TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHNTA	(AX)
	RET

// This is called from .init_array and follows the platform, not Go, ABI.
// DI carries the module's moduledata pointer (first C argument), which is
// appended to the runtime·lastmoduledatap linked list.
TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
	PUSHQ	R15	// The access to global variables below implicitly uses R15, which is callee-save
	MOVQ	runtime·lastmoduledatap(SB), AX
	MOVQ	DI, moduledata_next(AX)
	MOVQ	DI, runtime·lastmoduledatap(SB)
	POPQ	R15
	RET