// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "go_tls.h"
#include "funcdata.h"
#include "textflag.h"

// rt0_go is the Go program entry point on amd64: it sets up g0's stack
// bounds, probes CPU features via CPUID, runs _cgo_init if present,
// initializes TLS, wires up m0/g0, runs schedinit, queues runtime.main
// as the first goroutine, and starts this M. It never returns.
TEXT runtime·rt0_go(SB),NOSPLIT,$0
	// copy arguments forward on an even stack
	MOVQ	DI, AX		// argc
	MOVQ	SI, BX		// argv
	SUBQ	$(4*8+7), SP		// 2args 2auto
	ANDQ	$~15, SP
	MOVQ	AX, 16(SP)
	MOVQ	BX, 24(SP)

	// create istack out of the given (operating system) stack.
	// _cgo_init may update stackguard.
	MOVQ	$runtime·g0(SB), DI
	LEAQ	(-64*1024+104)(SP), BX
	MOVQ	BX, g_stackguard0(DI)
	MOVQ	BX, g_stackguard1(DI)
	MOVQ	BX, (g_stack+stack_lo)(DI)
	MOVQ	SP, (g_stack+stack_hi)(DI)

	// find out information about the processor we're on
	MOVQ	$0, AX
	CPUID
	MOVQ	AX, SI		// SI = max basic CPUID leaf
	CMPQ	AX, $0
	JE	nocpuinfo

	// Figure out how to serialize RDTSC.
	// On Intel processors LFENCE is enough. AMD requires MFENCE.
	// Don't know about the rest, so let's do MFENCE.
	CMPL	BX, $0x756E6547  // "Genu"
	JNE	notintel
	CMPL	DX, $0x49656E69  // "ineI"
	JNE	notintel
	CMPL	CX, $0x6C65746E  // "ntel"
	JNE	notintel
	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
notintel:

	// Load EAX=1 cpuid flags
	MOVQ	$1, AX
	CPUID
	MOVL	CX, runtime·cpuid_ecx(SB)
	MOVL	DX, runtime·cpuid_edx(SB)

	// Load EAX=7/ECX=0 cpuid flags
	CMPQ	SI, $7
	JLT	no7
	MOVL	$7, AX
	MOVL	$0, CX
	CPUID
	MOVL	BX, runtime·cpuid_ebx7(SB)
no7:
	// Detect AVX and AVX2 as per 14.7.1 Detection of AVX2 chapter of [1]
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
	MOVL	runtime·cpuid_ecx(SB), CX
	ANDL	$0x18000000, CX		// check for OSXSAVE and AVX bits
	CMPL	CX, $0x18000000
	JNE	noavx
	MOVL	$0, CX
	// For XGETBV, OSXSAVE bit is required and sufficient
	XGETBV
	ANDL	$6, AX
	CMPL	AX, $6			// Check for OS support of YMM registers
	JNE	noavx
	MOVB	$1, runtime·support_avx(SB)
	TESTL	$(1<<5), runtime·cpuid_ebx7(SB)	// check for AVX2 bit
	JEQ	noavx2
	MOVB	$1, runtime·support_avx2(SB)
	JMP	nocpuinfo
noavx:
	MOVB	$0, runtime·support_avx(SB)
noavx2:
	MOVB	$0, runtime·support_avx2(SB)
nocpuinfo:

	// if there is an _cgo_init, call it.
	MOVQ	_cgo_init(SB), AX
	TESTQ	AX, AX
	JZ	needtls
	// g0 already in DI
	MOVQ	DI, CX	// Win64 uses CX for first parameter
	MOVQ	$setg_gcc<>(SB), SI
	CALL	AX

	// update stackguard after _cgo_init
	MOVQ	$runtime·g0(SB), CX
	MOVQ	(g_stack+stack_lo)(CX), AX
	ADDQ	$const__StackGuard, AX
	MOVQ	AX, g_stackguard0(CX)
	MOVQ	AX, g_stackguard1(CX)

#ifndef GOOS_windows
	JMP ok
#endif
needtls:
#ifdef GOOS_plan9
	// skip TLS setup on Plan 9
	JMP ok
#endif
#ifdef GOOS_solaris
	// skip TLS setup on Solaris
	JMP ok
#endif

	LEAQ	runtime·m0+m_tls(SB), DI
	CALL	runtime·settls(SB)

	// store through it, to make sure it works
	get_tls(BX)
	MOVQ	$0x123, g(BX)
	MOVQ	runtime·m0+m_tls(SB), AX
	CMPQ	AX, $0x123
	JEQ 2(PC)
	MOVL	AX, 0	// abort
ok:
	// set the per-goroutine and per-mach "registers"
	get_tls(BX)
	LEAQ	runtime·g0(SB), CX
	MOVQ	CX, g(BX)
	LEAQ	runtime·m0(SB), AX

	// save m->g0 = g0
	MOVQ	CX, m_g0(AX)
	// save m0 to g0->m
	MOVQ	AX, g_m(CX)

	CLD				// convention is D is always left cleared
	CALL	runtime·check(SB)

	MOVL	16(SP), AX		// copy argc
	MOVL	AX, 0(SP)
	MOVQ	24(SP), AX		// copy argv
	MOVQ	AX, 8(SP)
	CALL	runtime·args(SB)
	CALL	runtime·osinit(SB)
	CALL	runtime·schedinit(SB)

	// create a new goroutine to start program
	MOVQ	$runtime·mainPC(SB), AX		// entry
	PUSHQ	AX
	PUSHQ	$0			// arg size
	CALL	runtime·newproc(SB)
	POPQ	AX
	POPQ	AX

	// start this M
	CALL	runtime·mstart(SB)

	MOVL	$0xf1, 0xf1	// crash; mstart should never return
	RET

// mainPC is a function value for runtime.main, to be passed to newproc above.
DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
GLOBL	runtime·mainPC(SB),RODATA,$8

TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
	BYTE	$0xcc	// INT 3: trap to the debugger
	RET

TEXT runtime·asminit(SB),NOSPLIT,$0-0
	// No per-thread init.
	RET

/*
 *  go-routine
 */

// void gosave(Gobuf*)
// save state in Gobuf; setjmp
TEXT runtime·gosave(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), AX		// gobuf
	LEAQ	buf+0(FP), BX		// caller's SP
	MOVQ	BX, gobuf_sp(AX)
	MOVQ	0(SP), BX		// caller's PC
	MOVQ	BX, gobuf_pc(AX)
	MOVQ	$0, gobuf_ret(AX)
	MOVQ	$0, gobuf_ctxt(AX)
	MOVQ	BP, gobuf_bp(AX)
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	BX, gobuf_g(AX)
	RET

// void gogo(Gobuf*)
// restore state from Gobuf; longjmp
TEXT runtime·gogo(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), BX		// gobuf
	MOVQ	gobuf_g(BX), DX
	MOVQ	0(DX), CX		// make sure g != nil
	get_tls(CX)
	MOVQ	DX, g(CX)
	MOVQ	gobuf_sp(BX), SP	// restore SP
	MOVQ	gobuf_ret(BX), AX
	MOVQ	gobuf_ctxt(BX), DX
	MOVQ	gobuf_bp(BX), BP
	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
	MOVQ	$0, gobuf_ret(BX)
	MOVQ	$0, gobuf_ctxt(BX)
	MOVQ	$0, gobuf_bp(BX)
	MOVQ	gobuf_pc(BX), BX
	JMP	BX

// func mcall(fn func(*g))
// Switch to m->g0's stack, call fn(g).
// Fn must never return. It should gogo(&g->sched)
// to keep running g.
TEXT runtime·mcall(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI

	get_tls(CX)
	MOVQ	g(CX), AX	// save state in g->sched
	MOVQ	0(SP), BX	// caller's PC
	MOVQ	BX, (g_sched+gobuf_pc)(AX)
	LEAQ	fn+0(FP), BX	// caller's SP
	MOVQ	BX, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to m->g0 & its stack, call fn
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	SI, AX	// if g == m->g0 call badmcall
	JNE	3(PC)
	MOVQ	$runtime·badmcall(SB), AX
	JMP	AX
	MOVQ	SI, g(CX)	// g = m->g0
	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	PUSHQ	AX		// AX = the g we came from; passed to fn
	MOVQ	DI, DX
	MOVQ	0(DI), DI	// code pointer out of the func value
	CALL	DI
	POPQ	AX
	// fn must not return; if it does, report it.
	MOVQ	$runtime·badmcall2(SB), AX
	JMP	AX
	RET

// systemstack_switch is a dummy routine that systemstack leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the system stack because the one at the top of
// the system stack terminates the stack walk (see topofstack()).
TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
	RET

// func systemstack(fn func())
TEXT runtime·systemstack(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI	// DI = fn
	get_tls(CX)
	MOVQ	g(CX), AX	// AX = g
	MOVQ	g_m(AX), BX	// BX = m

	MOVQ	m_gsignal(BX), DX	// DX = gsignal
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_g0(BX), DX	// DX = g0
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_curg(BX), R8
	CMPQ	AX, R8
	JEQ	switch

	// Bad: g is not gsignal, not g0, not curg. What is it?
	MOVQ	$runtime·badsystemstack(SB), AX
	CALL	AX

switch:
	// save our state in g->sched. Pretend to
	// be systemstack_switch if the G stack is scanned.
	MOVQ	$runtime·systemstack_switch(SB), SI
	MOVQ	SI, (g_sched+gobuf_pc)(AX)
	MOVQ	SP, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to g0
	MOVQ	DX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(DX), BX
	// make it look like mstart called systemstack on g0, to stop traceback
	SUBQ	$8, BX
	MOVQ	$runtime·mstart(SB), DX
	MOVQ	DX, 0(BX)
	MOVQ	BX, SP

	// call target function
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI

	// switch back to g
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), BX
	MOVQ	m_curg(BX), AX
	MOVQ	AX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(AX), SP
	MOVQ	$0, (g_sched+gobuf_sp)(AX)
	RET

noswitch:
	// already on m stack, just call directly
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI
	RET

/*
 * support for morestack
 */

// Called during function prolog when more stack is needed.
//
// The traceback routines see morestack on a g0 as being
// the top of a stack (for example, morestack calling newstack
// calling the scheduler calling newm calling gc), so we must
// record an argument size. For that purpose, it has no arguments.
TEXT runtime·morestack(SB),NOSPLIT,$0-0
	// Cannot grow scheduler stack (m->g0).
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	g(CX), SI
	JNE	2(PC)
	INT	$3

	// Cannot grow signal stack (m->gsignal).
	MOVQ	m_gsignal(BX), SI
	CMPQ	g(CX), SI
	JNE	2(PC)
	INT	$3

	// Called from f.
	// Set m->morebuf to f's caller.
	MOVQ	8(SP), AX	// f's caller's PC
	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
	LEAQ	16(SP), AX	// f's caller's SP
	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SI, (m_morebuf+gobuf_g)(BX)

	// Set g->sched to context in f.
	MOVQ	0(SP), AX // f's PC
	MOVQ	AX, (g_sched+gobuf_pc)(SI)
	MOVQ	SI, (g_sched+gobuf_g)(SI)
	LEAQ	8(SP), AX // f's SP
	MOVQ	AX, (g_sched+gobuf_sp)(SI)
	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
	MOVQ	BP, (g_sched+gobuf_bp)(SI)

	// Call newstack on m->g0's stack.
	MOVQ	m_g0(BX), BX
	MOVQ	BX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(BX), SP
	CALL	runtime·newstack(SB)
	MOVQ	$0, 0x1003	// crash if newstack returns
	RET

// morestack but not preserving ctxt.
TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
	MOVL	$0, DX
	JMP	runtime·morestack(SB)

TEXT runtime·stackBarrier(SB),NOSPLIT,$0
	// We came here via a RET to an overwritten return PC.
	// AX may be live. Other registers are available.

	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
	get_tls(CX)
	MOVQ	g(CX), CX
	MOVQ	(g_stkbar+slice_array)(CX), DX
	MOVQ	g_stkbarPos(CX), BX
	IMULQ	$stkbar__size, BX	// Too big for SIB.
	MOVQ	stkbar_savedLRPtr(DX)(BX*1), R8
	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
	// Assert that we're popping the right saved LR.
	ADDQ	$8, R8
	CMPQ	R8, SP
	JEQ	2(PC)
	MOVL	$0, 0	// deliberate fault: saved LR mismatch
	// Record that this stack barrier was hit.
	ADDQ	$1, g_stkbarPos(CX)
	// Jump to the original return PC.
	JMP	BX

// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
// of constant-sized-frame functions to encode a few bits of size in the pc.
// Caution: ugly multiline assembly macros in your future!

#define DISPATCH(NAME,MAXSIZE)		\
	CMPQ	CX, $MAXSIZE;		\
	JA	3(PC);			\
	MOVQ	$NAME(SB), AX;		\
	JMP	AX
// Note: can't just "JMP NAME(SB)" - bad inlining results.
TEXT reflect·call(SB), NOSPLIT, $0-0
	JMP	·reflectcall(SB)

// reflectcall dispatches on the argument size to a fixed-frame call
// stub (call32 .. call1073741824) generated by CALLFN below.
TEXT ·reflectcall(SB), NOSPLIT, $0-32
	MOVLQZX argsize+24(FP), CX
	// NOTE(rsc): No call16, because CALLFN needs four words
	// of argument space to invoke callwritebarrier.
	DISPATCH(runtime·call32, 32)
	DISPATCH(runtime·call64, 64)
	DISPATCH(runtime·call128, 128)
	DISPATCH(runtime·call256, 256)
	DISPATCH(runtime·call512, 512)
	DISPATCH(runtime·call1024, 1024)
	DISPATCH(runtime·call2048, 2048)
	DISPATCH(runtime·call4096, 4096)
	DISPATCH(runtime·call8192, 8192)
	DISPATCH(runtime·call16384, 16384)
	DISPATCH(runtime·call32768, 32768)
	DISPATCH(runtime·call65536, 65536)
	DISPATCH(runtime·call131072, 131072)
	DISPATCH(runtime·call262144, 262144)
	DISPATCH(runtime·call524288, 524288)
	DISPATCH(runtime·call1048576, 1048576)
	DISPATCH(runtime·call2097152, 2097152)
	DISPATCH(runtime·call4194304, 4194304)
	DISPATCH(runtime·call8388608, 8388608)
	DISPATCH(runtime·call16777216, 16777216)
	DISPATCH(runtime·call33554432, 33554432)
	DISPATCH(runtime·call67108864, 67108864)
	DISPATCH(runtime·call134217728, 134217728)
	DISPATCH(runtime·call268435456, 268435456)
	DISPATCH(runtime·call536870912, 536870912)
	DISPATCH(runtime·call1073741824, 1073741824)
	MOVQ	$runtime·badreflectcall(SB), AX
	JMP	AX

// CALLFN defines one fixed-frame-size call stub: copy the arguments
// into the frame, call f, copy results back out, then run write
// barriers for any pointers in the result area.
#define CALLFN(NAME,MAXSIZE)			\
TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
	NO_LOCAL_POINTERS;			\
	/* copy arguments to stack */		\
	MOVQ	argptr+16(FP), SI;		\
	MOVLQZX argsize+24(FP), CX;		\
	MOVQ	SP, DI;				\
	REP;MOVSB;				\
	/* call function */			\
	MOVQ	f+8(FP), DX;			\
	PCDATA	$PCDATA_StackMapIndex, $0;	\
	CALL	(DX);				\
	/* copy return values back */		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX	argsize+24(FP), CX;		\
	MOVLQZX retoffset+28(FP), BX;		\
	MOVQ	SP, SI;				\
	ADDQ	BX, DI;				\
	ADDQ	BX, SI;				\
	SUBQ	BX, CX;				\
	REP;MOVSB;				\
	/* execute write barrier updates */	\
	MOVQ	argtype+0(FP), DX;		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX	argsize+24(FP), CX;		\
	MOVLQZX retoffset+28(FP), BX;		\
	MOVQ	DX, 0(SP);			\
	MOVQ	DI, 8(SP);			\
	MOVQ	CX, 16(SP);			\
	MOVQ	BX, 24(SP);			\
	CALL	runtime·callwritebarrier(SB);	\
	RET

CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
CALLFN(·call256, 256)
CALLFN(·call512, 512)
CALLFN(·call1024, 1024)
CALLFN(·call2048, 2048)
CALLFN(·call4096, 4096)
CALLFN(·call8192, 8192)
CALLFN(·call16384, 16384)
CALLFN(·call32768, 32768)
CALLFN(·call65536, 65536)
CALLFN(·call131072, 131072)
CALLFN(·call262144, 262144)
CALLFN(·call524288, 524288)
CALLFN(·call1048576, 1048576)
CALLFN(·call2097152, 2097152)
CALLFN(·call4194304, 4194304)
CALLFN(·call8388608, 8388608)
CALLFN(·call16777216, 16777216)
CALLFN(·call33554432, 33554432)
CALLFN(·call67108864, 67108864)
CALLFN(·call134217728, 134217728)
CALLFN(·call268435456, 268435456)
CALLFN(·call536870912, 536870912)
CALLFN(·call1073741824, 1073741824)

// procyield spins for the given number of cycles using PAUSE.
TEXT runtime·procyield(SB),NOSPLIT,$0-0
	MOVL	cycles+0(FP), AX
again:
	PAUSE
	SUBL	$1, AX
	JNZ	again
	RET


TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
	// Stores are already ordered on x86, so this is just a
	// compile barrier.
	RET

// void jmpdefer(fn, sp);
// called from deferreturn.
// 1. pop the caller
// 2. sub 5 bytes from the callers return
// 3. jmp to the argument
TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
	MOVQ	fv+0(FP), DX	// fn
	MOVQ	argp+8(FP), BX	// caller sp
	LEAQ	-8(BX), SP	// caller sp after CALL
	MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
	SUBQ	$5, (SP)	// return to CALL again (CALL rel32 is 5 bytes)
	MOVQ	0(DX), BX
	JMP	BX	// but first run the deferred function

// Save state of caller into g->sched. Smashes R8, R9.
TEXT gosave<>(SB),NOSPLIT,$0
	get_tls(R8)
	MOVQ	g(R8), R8
	MOVQ	0(SP), R9
	MOVQ	R9, (g_sched+gobuf_pc)(R8)
	LEAQ	8(SP), R9
	MOVQ	R9, (g_sched+gobuf_sp)(R8)
	MOVQ	$0, (g_sched+gobuf_ret)(R8)
	MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
	MOVQ	BP, (g_sched+gobuf_bp)(R8)
	RET

// func asmcgocall(fn, arg unsafe.Pointer) int32
// Call fn(arg) on the scheduler stack,
// aligned appropriately for the gcc ABI.
// See cgocall.go for more details.
TEXT ·asmcgocall(SB),NOSPLIT,$0-20
	MOVQ	fn+0(FP), AX
	MOVQ	arg+8(FP), BX

	MOVQ	SP, DX

	// Figure out if we need to switch to m->g0 stack.
	// We get called to create new OS threads too, and those
	// come in on the m->g0 stack already.
	get_tls(CX)
	MOVQ	g(CX), R8
	CMPQ	R8, $0
	JEQ	nosave
	MOVQ	g_m(R8), R8
	MOVQ	m_g0(R8), SI
	MOVQ	g(CX), DI
	CMPQ	SI, DI
	JEQ	nosave
	MOVQ	m_gsignal(R8), SI
	CMPQ	SI, DI
	JEQ	nosave

	// Switch to system stack.
	MOVQ	m_g0(R8), SI
	CALL	gosave<>(SB)
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP

	// Now on a scheduling stack (a pthread-created stack).
	// Make sure we have enough room for 4 stack-backed fast-call
	// registers as per windows amd64 calling convention.
	SUBQ	$64, SP
	ANDQ	$~15, SP	// alignment for gcc ABI
	MOVQ	DI, 48(SP)	// save g
	MOVQ	(g_stack+stack_hi)(DI), DI
	SUBQ	DX, DI
	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
	MOVQ	BX, CX		// CX = first argument in Win64
	CALL	AX

	// Restore registers, g, stack pointer.
	get_tls(CX)
	MOVQ	48(SP), DI
	MOVQ	(g_stack+stack_hi)(DI), SI
	SUBQ	40(SP), SI
	MOVQ	DI, g(CX)
	MOVQ	SI, SP

	MOVL	AX, ret+16(FP)
	RET

nosave:
	// Running on a system stack, perhaps even without a g.
	// Having no g can happen during thread creation or thread teardown
	// (see needm/dropm on Solaris, for example).
	// This code is like the above sequence but without saving/restoring g
	// and without worrying about the stack moving out from under us
	// (because we're on a system stack, not a goroutine stack).
	// The above code could be used directly if already on a system stack,
	// but then the only path through this code would be a rare case on Solaris.
	// Using this code for all "already on system stack" calls exercises it more,
	// which should help keep it correct.
	SUBQ	$64, SP
	ANDQ	$~15, SP
	MOVQ	$0, 48(SP)	// where above code stores g, in case someone looks during debugging
	MOVQ	DX, 40(SP)	// save original stack pointer
	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
	MOVQ	BX, CX		// CX = first argument in Win64
	CALL	AX
	MOVQ	40(SP), SI	// restore original stack pointer
	MOVQ	SI, SP
	MOVL	AX, ret+16(FP)
	RET

// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
// Turn the fn into a Go func (by taking its address) and call
// cgocallback_gofunc.
TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
	LEAQ	fn+0(FP), AX
	MOVQ	AX, 0(SP)
	MOVQ	frame+8(FP), AX
	MOVQ	AX, 8(SP)
	MOVQ	framesize+16(FP), AX
	MOVQ	AX, 16(SP)
	MOVQ	ctxt+24(FP), AX
	MOVQ	AX, 24(SP)
	MOVQ	$runtime·cgocallback_gofunc(SB), AX
	CALL	AX
	RET

// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
// See cgocall.go for more details.
TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
	NO_LOCAL_POINTERS

	// If g is nil, Go did not create the current thread.
	// Call needm to obtain one m for temporary use.
	// In this case, we're running on the thread stack, so there's
	// lots of space, but the linker doesn't know. Hide the call from
	// the linker analysis by using an indirect call through AX.
	get_tls(CX)
#ifdef GOOS_windows
	MOVL	$0, BX
	CMPQ	CX, $0
	JEQ	2(PC)
#endif
	MOVQ	g(CX), BX
	CMPQ	BX, $0
	JEQ	needm
	MOVQ	g_m(BX), BX
	MOVQ	BX, R8	// holds oldm until end of function
	JMP	havem
needm:
	MOVQ	$0, 0(SP)
	MOVQ	$runtime·needm(SB), AX
	CALL	AX
	MOVQ	0(SP), R8
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX

	// Set m->sched.sp = SP, so that if a panic happens
	// during the function we are about to execute, it will
	// have a valid SP to run on the g0 stack.
	// The next few lines (after the havem label)
	// will save this SP onto the stack and then write
	// the same SP back to m->sched.sp. That seems redundant,
	// but if an unrecovered panic happens, unwindm will
	// restore the g->sched.sp from the stack location
	// and then systemstack will try to use it. If we don't set it here,
	// that restored SP will be uninitialized (typically 0) and
	// will not be usable.
	MOVQ	m_g0(BX), SI
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

havem:
	// Now there's a valid m, and we're running on its m->g0.
	// Save current m->g0->sched.sp on stack and then set it to SP.
	// Save current sp in m->g0->sched.sp in preparation for
	// switch back to m->curg stack.
	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
	MOVQ	m_g0(BX), SI
	MOVQ	(g_sched+gobuf_sp)(SI), AX
	MOVQ	AX, 0(SP)
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

	// Switch to m->curg stack and call runtime.cgocallbackg.
	// Because we are taking over the execution of m->curg
	// but *not* resuming what had been running, we need to
	// save that information (m->curg->sched) so we can restore it.
	// We can restore m->curg->sched.sp easily, because calling
	// runtime.cgocallbackg leaves SP unchanged upon return.
	// To save m->curg->sched.pc, we push it onto the stack.
	// This has the added benefit that it looks to the traceback
	// routine like cgocallbackg is going to return to that
	// PC (because the frame we allocate below has the same
	// size as cgocallback_gofunc's frame declared above)
	// so that the traceback will seamlessly trace back into
	// the earlier calls.
	//
	// In the new goroutine, 8(SP) holds the saved R8.
	MOVQ	m_curg(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), DI	// prepare stack as DI
	MOVQ	(g_sched+gobuf_pc)(SI), BX
	MOVQ	BX, -8(DI)
	// Compute the size of the frame, including return PC and, if
	// GOEXPERIMENT=framepointer, the saved based pointer
	MOVQ	ctxt+24(FP), BX
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX
	SUBQ	AX, DI
	MOVQ	DI, SP

	MOVQ	R8, 8(SP)
	MOVQ	BX, 0(SP)
	CALL	runtime·cgocallbackg(SB)
	MOVQ	8(SP), R8

	// Compute the size of the frame again. FP and SP have
	// completely different values here than they did above,
	// but only their difference matters.
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX

	// Restore g->sched (== m->curg->sched) from saved values.
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SP, DI
	ADDQ	AX, DI
	MOVQ	-8(DI), BX
	MOVQ	BX, (g_sched+gobuf_pc)(SI)
	MOVQ	DI, (g_sched+gobuf_sp)(SI)

	// Switch back to m->g0's stack and restore m->g0->sched.sp.
	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
	// so we do not have to restore it.)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP
	MOVQ	0(SP), AX
	MOVQ	AX, (g_sched+gobuf_sp)(SI)

	// If the m on entry was nil, we called needm above to borrow an m
	// for the duration of the call. Since the call is over, return it with dropm.
	CMPQ	R8, $0
	JNE 3(PC)
	MOVQ	$runtime·dropm(SB), AX
	CALL	AX

	// Done!
	RET

// void setg(G*); set g. for use by needm.
TEXT runtime·setg(SB), NOSPLIT, $0-8
	MOVQ	gg+0(FP), BX
#ifdef GOOS_windows
	CMPQ	BX, $0
	JNE	settls
	MOVQ	$0, 0x28(GS)	// clear the TLS g slot on Windows
	RET
settls:
	MOVQ	g_m(BX), AX
	LEAQ	m_tls(AX), AX
	MOVQ	AX, 0x28(GS)
#endif
	get_tls(CX)
	MOVQ	BX, g(CX)
	RET

// void setg_gcc(G*); set g called from gcc.
TEXT setg_gcc<>(SB),NOSPLIT,$0
	get_tls(AX)
	MOVQ	DI, g(AX)
	RET

// check that SP is in range [g->stack.lo, g->stack.hi)
TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
	get_tls(CX)
	MOVQ	g(CX), AX
	CMPQ	(g_stack+stack_hi)(AX), SP
	JHI	2(PC)
	INT	$3	// SP >= stack.hi: out of range
	CMPQ	SP, (g_stack+stack_lo)(AX)
	JHI	2(PC)
	INT	$3	// SP <= stack.lo: out of range
	RET

TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
	MOVQ	argp+0(FP),AX		// addr of first arg
	MOVQ	-8(AX),AX		// get calling pc
	CMPQ	AX, runtime·stackBarrierPC(SB)
	JNE	nobar
	// Get original return PC.
	CALL	runtime·nextBarrierPC(SB)
	MOVQ	0(SP), AX
nobar:
	MOVQ	AX, ret+8(FP)
	RET

TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
	MOVQ	argp+0(FP),AX		// addr of first arg
	MOVQ	pc+8(FP), BX
	MOVQ	-8(AX), CX
	CMPQ	CX, runtime·stackBarrierPC(SB)
	JEQ	setbar
	MOVQ	BX, -8(AX)		// set calling pc
	RET
setbar:
	// Set the stack barrier return PC.
	MOVQ	BX, 0(SP)
	CALL	runtime·setNextBarrierPC(SB)
	RET

TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
	MOVQ	argp+0(FP), AX
	MOVQ	AX, ret+8(FP)
	RET

// func cputicks() int64
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
	// Serialize RDTSC: LFENCE on Intel, MFENCE elsewhere
	// (see lfenceBeforeRdtsc setup in rt0_go).
	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
	JNE	mfence
	LFENCE
	JMP	done
mfence:
	MFENCE
done:
	RDTSC
	SHLQ	$32, DX
	ADDQ	DX, AX	// combine EDX:EAX into a 64-bit tick count
	MOVQ	AX, ret+0(FP)
	RET

// memhash_varlen(p unsafe.Pointer, h seed) uintptr
// redirects to memhash(p, h, size) using the size
// stored in the closure.
TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
	GO_ARGS
	NO_LOCAL_POINTERS
	MOVQ	p+0(FP), AX
	MOVQ	h+8(FP), BX
	MOVQ	8(DX), CX	// size is stored in the closure (DX = closure ptr)
	MOVQ	AX, 0(SP)
	MOVQ	BX, 8(SP)
	MOVQ	CX, 16(SP)
	CALL	runtime·memhash(SB)
	MOVQ	24(SP), AX
	MOVQ	AX, ret+16(FP)
	RET

// hash function using AES hardware instructions
TEXT runtime·aeshash(SB),NOSPLIT,$0-32
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	s+16(FP), CX	// size
	LEAQ	ret+24(FP), DX
	JMP	runtime·aeshashbody(SB)

TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to string struct
	MOVQ	8(AX), CX	// length of string
	MOVQ	(AX), AX	// string data
	LEAQ	ret+16(FP), DX
	JMP	runtime·aeshashbody(SB)

// AX: data
// CX: length
// DX: address to put return value
TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
	// Fill an SSE register with our seeds.
	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
	PINSRW	$4, CX, X0			// 16 bits of length
	PSHUFHW	$0, X0, X0			// repeat length 4 times total
	MOVO	X0, X1				// save unscrambled seed
	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
	AESENC	X0, X0				// scramble seed

	// Dispatch on input length.
	CMPQ	CX, $16
	JB	aes0to15
	JE	aes16
	CMPQ	CX, $32
	JBE	aes17to32
	CMPQ	CX, $64
	JBE	aes33to64
	CMPQ	CX, $128
	JBE	aes65to128
	JMP	aes129plus

aes0to15:
	TESTQ	CX, CX
	JE	aes0

	ADDQ	$16, AX
	TESTW	$0xff0, AX
	JE	endofpage

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X1
	ADDQ	CX, CX
	MOVQ	$masks<>(SB), AX
	PAND	(AX)(CX*8), X1
final1:
	AESENC	X0, X1	// scramble input, xor in seed
	AESENC	X1, X1	// scramble combo 2 times
	AESENC	X1, X1
	MOVQ	X1, (DX)
	RET

endofpage:
	// address ends in 1111xxxx. Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	MOVOU	-32(AX)(CX*1), X1
	ADDQ	CX, CX
	MOVQ	$shifts<>(SB), AX
	PSHUFB	(AX)(CX*8), X1
	JMP	final1

aes0:
	// Return scrambled input seed
	AESENC	X0, X0
	MOVQ	X0, (DX)
	RET

aes16:
	MOVOU	(AX), X1
	JMP	final1

aes17to32:
	// make second starting seed
	PXOR	runtime·aeskeysched+16(SB), X1
	AESENC	X1, X1

	// load data to be hashed
	MOVOU	(AX), X2
	MOVOU	-16(AX)(CX*1), X3

	// scramble 3 times
	AESENC	X0, X2
	AESENC	X1, X3
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3

	// combine results
	PXOR	X3, X2
	MOVQ	X2, (DX)
	RET

aes33to64:
	// make 3 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3

	MOVOU	(AX), X4
	MOVOU	16(AX), X5
	MOVOU	-32(AX)(CX*1), X6
	MOVOU	-16(AX)(CX*1), X7

	AESENC	X0, X4
	AESENC	X1, X5
	AESENC	X2, X6
	AESENC	X3, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	PXOR	X6, X4
	PXOR	X7, X5
	PXOR	X5, X4
	MOVQ	X4, (DX)
	RET

aes65to128:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// load data
	MOVOU	(AX), X8
	MOVOU	16(AX), X9
	MOVOU	32(AX), X10
	MOVOU	48(AX), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// scramble data, xor in seed
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	// scramble twice
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

aes129plus:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// start with last (possibly overlapping) block
	MOVOU	-128(AX)(CX*1), X8
	MOVOU	-112(AX)(CX*1), X9
	MOVOU	-96(AX)(CX*1), X10
	MOVOU	-80(AX)(CX*1), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// scramble input once, xor in seed
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	// compute number of remaining 128-byte blocks
	DECQ	CX
	SHRQ	$7, CX

aesloop:
	// scramble state, xor in a block
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	MOVOU	64(AX), X4
	MOVOU	80(AX), X5
	MOVOU	96(AX), X6
	MOVOU	112(AX), X7
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	// scramble state
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	ADDQ	$128, AX
	DECQ	CX
	JNE	aesloop

	// 2 more scrambles to finish
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

// simple mask to get rid of data in the high part of the register.
// Entry i (0 <= i <= 16) is a 16-byte mask whose low i bytes are 0xff
// and remaining bytes are 0x00; used to zero the unused high part of a
// partially-loaded register.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256

// Sanity check for the assembly constants above, called from Go.
// Returns true iff both tables are 16-byte aligned.
TEXT ·checkASM(SB),NOSPLIT,$0-1
	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
	MOVQ	$masks<>(SB), AX
	MOVQ	$shifts<>(SB), BX
	ORQ	BX, AX
	TESTQ	$15, AX
	SETEQ	ret+0(FP)
	RET

// these are arguments to pshufb.
// They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
DATA shifts<>+0x00(SB)/8, $0x0000000000000000
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256

// memequal(p, q unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal(SB),NOSPLIT,$0-25
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq	// identical pointers compare equal without reading memory
	MOVQ	size+16(FP), BX
	LEAQ	ret+24(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+24(FP)
	RET

// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	8(DX), BX	// compiler stores size at offset 8 in the closure
	LEAQ	ret+16(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+16(FP)
	RET

// eqstring tests whether two strings are equal.
// The compiler guarantees that strings passed
// to eqstring have equal length.
// See runtime_test.go:eqstring_generic for
// equivalent Go code.
TEXT runtime·eqstring(SB),NOSPLIT,$0-33
	MOVQ	s1str+0(FP), SI
	MOVQ	s2str+16(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	s1len+8(FP), BX
	LEAQ	v+32(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, v+32(FP)
	RET

// memeqbody compares BX bytes at SI and DI and stores the boolean
// result (1 = equal) at the byte addressed by AX.
// a in SI
// b in DI
// count in BX
// address of result byte in AX
TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small
	CMPQ	BX, $64
	JB	bigloop
	CMPB	runtime·support_avx2(SB), $1
	JE	hugeloop_avx2

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	// AND the per-byte comparison results together; all 64 bytes are
	// equal iff every bit of the final mask is set.
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff
	JEQ	hugeloop
	MOVB	$0, (AX)
	RET

	// 64 bytes at a time using ymm registers
hugeloop_avx2:
	CMPQ	BX, $64
	JB	bigloop_avx2
	VMOVDQU	(SI), Y0
	VMOVDQU	(DI), Y1
	VMOVDQU	32(SI), Y2
	VMOVDQU	32(DI), Y3
	VPCMPEQB	Y1, Y0, Y4
	VPCMPEQB	Y2, Y3, Y5
	VPAND	Y4, Y5, Y6
	VPMOVMSKB	Y6, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffffffff
	JEQ	hugeloop_avx2
	VZEROUPPER	// leave AVX state before returning to SSE/scalar code
	MOVB	$0, (AX)
	RET

bigloop_avx2:
	VZEROUPPER

	// 8 bytes at a time using 64-bit register
bigloop:
	CMPQ	BX, $8
	JBE	leftover
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	bigloop
	MOVB	$0, (AX)
	RET

	// remaining 0-8 bytes
leftover:
	// Load the final 8 bytes of each buffer; this may re-read bytes
	// already compared, which is harmless.
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	(AX)
	RET

small:
	CMPQ	BX, $0
	JEQ	equal	// zero-length buffers are equal (CMPQ set ZF)

	LEAQ	0(BX*8), CX	// bytes left -> bits left
	NEGQ	CX

	CMPB	SI, $0xf8
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
di_finish:

	// Compare only the low BX bytes: difference shifted off the top is
	// discarded, ZF reflects equality of the bytes that matter.
	SUBQ	SI, DI
	SHLQ	CX, DI
equal:
	SETEQ	(AX)
	RET

// func cmpstring(s1, s2 string) int
TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
	MOVQ	s1_base+0(FP), SI
	MOVQ	s1_len+8(FP), BX
	MOVQ	s2_base+16(FP), DI
	MOVQ	s2_len+24(FP), DX
	LEAQ	ret+32(FP), R9
	JMP	runtime·cmpbody(SB)

// func Compare(a, b []byte) int
TEXT bytes·Compare(SB),NOSPLIT,$0-56
	MOVQ	s1+0(FP), SI
	MOVQ	s1+8(FP), BX
	MOVQ	s2+24(FP), DI
	MOVQ	s2+32(FP), DX
	LEAQ	res+48(FP), R9
	JMP	runtime·cmpbody(SB)

// cmpbody lexically compares the byte sequences a and b.
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame	// same pointer: result depends only on the lengths
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8	// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	CMPB	runtime·support_avx2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// diff64/diff48/diff32 adjust SI/DI so the differing 16-byte chunk
	// found by big_loop is the one diff16 inspects.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// Load the final 8 bytes of each side; may overlap bytes already
	// compared, which is harmless.
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX	// - bits left (== 64 - bits left mod 64)
	JEQ	allsame

	// load bytes of a into high bytes of AX
	CMPB	SI, $0xf8
	JA	si_high	// address ends in 11111xxx: avoid reading past a page boundary
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI

	// load bytes of b in to high bytes of BX
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// All compared bytes matched; break the tie on the lengths.
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET

// this works for >= 64 bytes of data.
	// SSE path: compare 64 bytes per iteration as four 16-byte chunks,
	// jumping to the matching diffNN label on the first mismatch.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ mask to NE mask
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP diff16

	// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP loop


// indexShortStr returns the index of the first occurrence of c in s,
// or -1 if c is not present. Specialized loops handle each needle
// length class (2, 3, 4, 5-7, 8, 9-15, 16, 17-31 bytes).
// TODO: Also use this in bytes.Index
TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), DI
	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
	MOVQ s_len+8(FP), DX
	MOVQ c+16(FP), BP
	MOVQ c_len+24(FP), AX
	CMPQ AX, DX
	JA fail	// needle longer than haystack: cannot match
	CMPQ DX, $16
	JAE sse42
no_sse42:
	CMPQ AX, $2
	JA   _3_or_more
	// 2-byte needle: compare 2 bytes at every offset.
	MOVW (BP), BP
	LEAQ -1(DI)(DX*1), DX
loop2:
	MOVW (DI), SI
	CMPW SI,BP
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop2
	JMP fail
_3_or_more:
	CMPQ AX, $3
	JA   _4_or_more
	// 3-byte needle: compare first 2 bytes, then the overlapping last 2.
	MOVW 1(BP), BX
	MOVW (BP), BP
	LEAQ -2(DI)(DX*1), DX
loop3:
	MOVW (DI), SI
	CMPW SI,BP
	JZ   partial_success3
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
partial_success3:
	MOVW 1(DI), SI
	CMPW SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
_4_or_more:
	CMPQ AX, $4
	JA   _5_or_more
	// 4-byte needle: single 32-bit compare per offset.
	MOVL (BP), BP
	LEAQ -3(DI)(DX*1), DX
loop4:
	MOVL (DI), SI
	CMPL SI,BP
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop4
	JMP fail
_5_or_more:
	CMPQ AX, $7
	JA   _8_or_more
	// 5-7 byte needle: compare first 4 bytes, then the overlapping last 4.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVL -4(BP)(AX*1), BX
	MOVL (BP), BP
loop5to7:
	MOVL (DI), SI
	CMPL SI,BP
	JZ   partial_success5to7
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
partial_success5to7:
	MOVL -4(AX)(DI*1), SI
	CMPL SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
_8_or_more:
	CMPQ AX, $8
	JA   _9_or_more
	// 8-byte needle: single 64-bit compare per offset.
	MOVQ (BP), BP
	LEAQ -7(DI)(DX*1), DX
loop8:
	MOVQ (DI), SI
	CMPQ SI,BP
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop8
	JMP fail
_9_or_more:
	CMPQ AX, $16
	JA   _16_or_more
	// 9-15 byte needle: compare first 8 bytes, then the overlapping last 8.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVQ -8(BP)(AX*1), BX
	MOVQ (BP), BP
loop9to15:
	MOVQ (DI), SI
	CMPQ SI,BP
	JZ   partial_success9to15
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
partial_success9to15:
	MOVQ -8(AX)(DI*1), SI
	CMPQ SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
_16_or_more:
	CMPQ AX, $16
	JA   _17_to_31
	// Exactly 16 bytes: one full xmm compare per offset.
	MOVOU (BP), X1
	LEAQ -15(DI)(DX*1), DX
loop16:
	MOVOU (DI), X2
	PCMPEQB X1, X2
	PMOVMSKB X2, SI
	CMPQ  SI, $0xffff
	JE   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop16
	JMP fail
_17_to_31:
	// 17-31 bytes: compare first 16 bytes, then the overlapping last 16.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVOU -16(BP)(AX*1), X0
	MOVOU (BP), X1
loop17to31:
	MOVOU (DI), X2
	PCMPEQB X1,X2
	PMOVMSKB X2, SI
	CMPQ  SI, $0xffff
	JE   partial_success17to31
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
fail:
	MOVQ $-1, ret+32(FP)
	RET
sse42:
	MOVL runtime·cpuid_ecx(SB), CX
	ANDL $0x100000, CX	// check the SSE4.2 feature bit
	JZ no_sse42
	CMPQ AX, $12
	// PCMPESTRI is slower than normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
	JAE _9_or_more
	LEAQ 16(BP), SI
	TESTW $0xff0, SI
	JEQ no_sse42	// needle load would cross a page boundary: fall back
	MOVOU (BP), X1
	LEAQ -15(DI)(DX*1), SI
	MOVQ $16, R9
	SUBQ AX, R9	// We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
	// for equality (bits 2,3 are 11)
	// result is not masked or inverted (bits 4,5 are 00)
	// and corresponds to first matching byte (bit 6 is 0)
	PCMPESTRI $0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means partial match at the end of the string,
	// otherwise sep is at offset CX from X1 start
	CMPQ CX, R9
	JBE sse42_success
	ADDQ R9, DI
	CMPQ DI, SI
	JB loop_sse42
	// Check the final (possibly overlapping) window at the end of s.
	PCMPESTRI $0x0c, -1(SI), X1
	CMPQ CX, R9
	JA fail
	LEAQ -1(SI), DI
sse42_success:
	ADDQ CX, DI
success:
	SUBQ s+0(FP), DI	// convert pointer to index within s
	MOVQ DI, ret+32(FP)
	RET


// func IndexByte(s []byte, c byte) int
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+24(FP), AL
	LEAQ ret+32(FP), R8
	JMP  runtime·indexbytebody(SB)

// func IndexByte(s string, c byte) int
TEXT strings·IndexByte(SB),NOSPLIT,$0-32
	MOVQ s+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+16(FP), AL
	LEAQ ret+24(FP), R8
	JMP  runtime·indexbytebody(SB)

// indexbytebody stores at (R8) the index of the first occurrence of
// AL in the BX bytes at SI, or -1 if it is absent.
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0

	CMPQ BX, $16
	JLT small

	MOVQ SI, DI

	CMPQ BX, $32
	JA avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Find first set bit, if any.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ	$-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ	SI, DI	// Compute offset of chunk within data.
	ADDQ	DX, DI	// Add offset of byte within chunk.
	MOVQ	DI, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure

	// Check if we'll load across a page boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1	// Load data
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX	// Move result bits to integer register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	CMPL	DX, BX
	JAE	failure	// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX	// Move result bits to integer register.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX	// Shift desired bits down to bottom of register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	CMPB   runtime·support_avx2(SB), $1
	JNE sse
	// Broadcast the target byte to all 32 lanes of Y1.
	MOVD AX, X0
	LEAQ -32(SI)(BX*1), R11
	VPBROADCASTB  X0, Y1
avx2_loop:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPTEST Y3, Y3
	JNZ avx2success
	ADDQ $32, DI
	CMPQ DI, R11
	JLT avx2_loop
	// Search the last (possibly overlapping) 32-byte chunk.
	MOVQ R11, DI
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPTEST Y3, Y3
	JNZ avx2success
	VZEROUPPER	// leave AVX state before returning to SSE/scalar code
	MOVQ $-1, (R8)
	RET

avx2success:
	VPMOVMSKB Y3, DX
	BSFL DX, DX	// index of match within the chunk
	SUBQ SI, DI	// offset of chunk within data
	ADDQ DI, DX
	MOVQ DX, (R8)
	VZEROUPPER
	RET

// func Equal(a, b []byte) bool
TEXT bytes·Equal(SB),NOSPLIT,$0-49
	MOVQ	a_len+8(FP), BX
	MOVQ	b_len+32(FP), CX
	CMPQ	BX, CX
	JNE	eqret	// different lengths are never equal
	MOVQ	a+0(FP), SI
	MOVQ	b+24(FP), DI
	LEAQ	ret+48(FP), AX
	JMP	runtime·memeqbody(SB)
eqret:
	MOVB	$0, ret+48(FP)
	RET

// fastrand1 returns the next value of the per-M pseudo-random state
// stored in m.fastrand.
TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), AX
	MOVL	m_fastrand(AX), DX
	ADDL	DX, DX
	MOVL	DX, BX
	XORL	$0x88888eef, DX
	CMOVLMI	BX, DX	// keep the un-xored value if the shift carried out a 1 bit
	MOVL	DX, m_fastrand(AX)
	MOVL	DX, ret+0(FP)
	RET

TEXT runtime·return0(SB), NOSPLIT, $0
	MOVL	$0, AX
	RET


// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
// Must obey the gcc calling convention.
TEXT _cgo_topofstack(SB),NOSPLIT,$0
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), AX
	MOVQ	m_curg(AX), AX
	MOVQ	(g_stack+stack_hi)(AX), AX
	RET

// The top-most function running on a goroutine
// returns to goexit+PCQuantum.
TEXT runtime·goexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP
	CALL	runtime·goexit1(SB)	// does not return
	// traceback from goexit1 must hit code range of goexit
	BYTE	$0x90	// NOP

// Each prefetch stub issues the corresponding x86 prefetch hint for the
// given address.
TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT0	(AX)
	RET

TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT1	(AX)
	RET

TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT2	(AX)
	RET

TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHNTA	(AX)
	RET

// This is called from .init_array and follows the platform, not Go, ABI.
// Links the caller's moduledata (in DI, per the platform ABI) onto the
// runtime's module list.
TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
	MOVQ	runtime·lastmoduledatap(SB), AX
	MOVQ	DI, moduledata_next(AX)
	MOVQ	DI, runtime·lastmoduledatap(SB)
	POPQ	R15
	RET