// Source: github.com/karrick/go src/runtime/asm_amd64.s
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "go_tls.h"
#include "funcdata.h"
#include "textflag.h"

TEXT runtime·rt0_go(SB),NOSPLIT,$0
	// copy arguments forward on an even stack
	MOVQ	DI, AX		// argc
	MOVQ	SI, BX		// argv
	SUBQ	$(4*8+7), SP		// 2args 2auto
	ANDQ	$~15, SP
	MOVQ	AX, 16(SP)
	MOVQ	BX, 24(SP)

	// create istack out of the given (operating system) stack.
	// _cgo_init may update stackguard.
	MOVQ	$runtime·g0(SB), DI
	LEAQ	(-64*1024+104)(SP), BX
	MOVQ	BX, g_stackguard0(DI)
	MOVQ	BX, g_stackguard1(DI)
	MOVQ	BX, (g_stack+stack_lo)(DI)
	MOVQ	SP, (g_stack+stack_hi)(DI)

	// find out information about the processor we're on
	MOVL	$0, AX
	CPUID
	MOVL	AX, SI
	CMPL	AX, $0
	JE	nocpuinfo

	// Figure out how to serialize RDTSC.
	// On Intel processors LFENCE is enough. AMD requires MFENCE.
	// Don't know about the rest, so let's do MFENCE.
	CMPL	BX, $0x756E6547  // "Genu"
	JNE	notintel
	CMPL	DX, $0x49656E69  // "ineI"
	JNE	notintel
	CMPL	CX, $0x6C65746E  // "ntel"
	JNE	notintel
	MOVB	$1, runtime·isIntel(SB)
	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
notintel:

	// Load EAX=1 cpuid flags
	MOVL	$1, AX
	CPUID
	MOVL	AX, runtime·processorVersionInfo(SB)

	TESTL	$(1<<26), DX // SSE2
	SETNE	runtime·support_sse2(SB)

	TESTL	$(1<<9), CX // SSSE3
	SETNE	runtime·support_ssse3(SB)

	TESTL	$(1<<19), CX // SSE4.1
	SETNE	runtime·support_sse41(SB)

	TESTL	$(1<<20), CX // SSE4.2
	SETNE	runtime·support_sse42(SB)

	TESTL	$(1<<23), CX // POPCNT
	SETNE	runtime·support_popcnt(SB)

	TESTL	$(1<<25), CX // AES
	SETNE	runtime·support_aes(SB)

	TESTL	$(1<<27), CX // OSXSAVE
	SETNE	runtime·support_osxsave(SB)

	// If OS support for XMM and YMM is not present
	// support_avx will be set back to false later.
	TESTL	$(1<<28), CX // AVX
	SETNE	runtime·support_avx(SB)

eax7:
	// Load EAX=7/ECX=0 cpuid flags
	CMPL	SI, $7
	JLT	osavx
	MOVL	$7, AX
	MOVL	$0, CX
	CPUID

	TESTL	$(1<<3), BX // BMI1
	SETNE	runtime·support_bmi1(SB)

	// If OS support for XMM and YMM is not present
	// support_avx2 will be set back to false later.
	TESTL	$(1<<5), BX
	SETNE	runtime·support_avx2(SB)

	TESTL	$(1<<8), BX // BMI2
	SETNE	runtime·support_bmi2(SB)

	TESTL	$(1<<9), BX // ERMS
	SETNE	runtime·support_erms(SB)

osavx:
	CMPB	runtime·support_osxsave(SB), $1
	JNE	noavx
	MOVL	$0, CX
	// For XGETBV, OSXSAVE bit is required and sufficient
	XGETBV
	ANDL	$6, AX
	CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
	JE	nocpuinfo
noavx:
	MOVB	$0, runtime·support_avx(SB)
	MOVB	$0, runtime·support_avx2(SB)

nocpuinfo:
	// if there is an _cgo_init, call it.
	MOVQ	_cgo_init(SB), AX
	TESTQ	AX, AX
	JZ	needtls
	// g0 already in DI
	MOVQ	DI, CX	// Win64 uses CX for first parameter
	MOVQ	$setg_gcc<>(SB), SI
	CALL	AX

	// update stackguard after _cgo_init
	MOVQ	$runtime·g0(SB), CX
	MOVQ	(g_stack+stack_lo)(CX), AX
	ADDQ	$const__StackGuard, AX
	MOVQ	AX, g_stackguard0(CX)
	MOVQ	AX, g_stackguard1(CX)

#ifndef GOOS_windows
	JMP ok
#endif
needtls:
#ifdef GOOS_plan9
	// skip TLS setup on Plan 9
	JMP ok
#endif
#ifdef GOOS_solaris
	// skip TLS setup on Solaris
	JMP ok
#endif

	LEAQ	runtime·m0+m_tls(SB), DI
	CALL	runtime·settls(SB)

	// store through it, to make sure it works
	get_tls(BX)
	MOVQ	$0x123, g(BX)
	MOVQ	runtime·m0+m_tls(SB), AX
	CMPQ	AX, $0x123
	JEQ 2(PC)
	MOVL	AX, 0	// abort
ok:
	// set the per-goroutine and per-mach "registers"
	get_tls(BX)
	LEAQ	runtime·g0(SB), CX
	MOVQ	CX, g(BX)
	LEAQ	runtime·m0(SB), AX

	// save m->g0 = g0
	MOVQ	CX, m_g0(AX)
	// save m0 to g0->m
	MOVQ	AX, g_m(CX)

	CLD				// convention is D is always left cleared
	CALL	runtime·check(SB)

	MOVL	16(SP), AX		// copy argc
	MOVL	AX, 0(SP)
	MOVQ	24(SP), AX		// copy argv
	MOVQ	AX, 8(SP)
	CALL	runtime·args(SB)
	CALL	runtime·osinit(SB)
	CALL	runtime·schedinit(SB)

	// create a new goroutine to start program
	MOVQ	$runtime·mainPC(SB), AX		// entry
	PUSHQ	AX
	PUSHQ	$0			// arg size
	CALL	runtime·newproc(SB)
	POPQ	AX
	POPQ	AX

	// start this M
	CALL	runtime·mstart(SB)

	MOVL	$0xf1, 0xf1  // crash
	RET

DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
GLOBL	runtime·mainPC(SB),RODATA,$8

TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
	BYTE	$0xcc
	RET

TEXT runtime·asminit(SB),NOSPLIT,$0-0
	// No per-thread init.
	RET

/*
 *  go-routine
 */

// void gosave(Gobuf*)
// save state in Gobuf; setjmp
TEXT runtime·gosave(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), AX		// gobuf
	LEAQ	buf+0(FP), BX		// caller's SP
	MOVQ	BX, gobuf_sp(AX)
	MOVQ	0(SP), BX		// caller's PC
	MOVQ	BX, gobuf_pc(AX)
	MOVQ	$0, gobuf_ret(AX)
	MOVQ	BP, gobuf_bp(AX)
	// Assert ctxt is zero. See func save.
	MOVQ	gobuf_ctxt(AX), BX
	TESTQ	BX, BX
	JZ	2(PC)
	CALL	runtime·badctxt(SB)
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	BX, gobuf_g(AX)
	RET

// void gogo(Gobuf*)
// restore state from Gobuf; longjmp
TEXT runtime·gogo(SB), NOSPLIT, $16-8
	MOVQ	buf+0(FP), BX		// gobuf

	// If ctxt is not nil, invoke deletion barrier before overwriting.
	MOVQ	gobuf_ctxt(BX), AX
	TESTQ	AX, AX
	JZ	nilctxt
	LEAQ	gobuf_ctxt(BX), AX
	MOVQ	AX, 0(SP)
	MOVQ	$0, 8(SP)
	CALL	runtime·writebarrierptr_prewrite(SB)
	MOVQ	buf+0(FP), BX

nilctxt:
	MOVQ	gobuf_g(BX), DX
	MOVQ	0(DX), CX		// make sure g != nil
	get_tls(CX)
	MOVQ	DX, g(CX)
	MOVQ	gobuf_sp(BX), SP	// restore SP
	MOVQ	gobuf_ret(BX), AX
	MOVQ	gobuf_ctxt(BX), DX
	MOVQ	gobuf_bp(BX), BP
	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
	MOVQ	$0, gobuf_ret(BX)
	MOVQ	$0, gobuf_ctxt(BX)
	MOVQ	$0, gobuf_bp(BX)
	MOVQ	gobuf_pc(BX), BX
	JMP	BX

// func mcall(fn func(*g))
// Switch to m->g0's stack, call fn(g).
// Fn must never return. It should gogo(&g->sched)
// to keep running g.
TEXT runtime·mcall(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI

	get_tls(CX)
	MOVQ	g(CX), AX	// save state in g->sched
	MOVQ	0(SP), BX	// caller's PC
	MOVQ	BX, (g_sched+gobuf_pc)(AX)
	LEAQ	fn+0(FP), BX	// caller's SP
	MOVQ	BX, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to m->g0 & its stack, call fn
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	SI, AX	// if g == m->g0 call badmcall
	JNE	3(PC)
	MOVQ	$runtime·badmcall(SB), AX
	JMP	AX
	MOVQ	SI, g(CX)	// g = m->g0
	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	PUSHQ	AX
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI
	POPQ	AX
	MOVQ	$runtime·badmcall2(SB), AX
	JMP	AX
	RET

// systemstack_switch is a dummy routine that systemstack leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the system stack because the one at the top of
// the system stack terminates the stack walk (see topofstack()).
TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
	RET

// func systemstack(fn func())
TEXT runtime·systemstack(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI	// DI = fn
	get_tls(CX)
	MOVQ	g(CX), AX	// AX = g
	MOVQ	g_m(AX), BX	// BX = m

	MOVQ	m_gsignal(BX), DX	// DX = gsignal
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_g0(BX), DX	// DX = g0
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_curg(BX), R8
	CMPQ	AX, R8
	JEQ	switch

	// Bad: g is not gsignal, not g0, not curg. What is it?
	MOVQ	$runtime·badsystemstack(SB), AX
	CALL	AX

switch:
	// save our state in g->sched. Pretend to
	// be systemstack_switch if the G stack is scanned.
	MOVQ	$runtime·systemstack_switch(SB), SI
	MOVQ	SI, (g_sched+gobuf_pc)(AX)
	MOVQ	SP, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to g0
	MOVQ	DX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(DX), BX
	// make it look like mstart called systemstack on g0, to stop traceback
	SUBQ	$8, BX
	MOVQ	$runtime·mstart(SB), DX
	MOVQ	DX, 0(BX)
	MOVQ	BX, SP

	// call target function
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI

	// switch back to g
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), BX
	MOVQ	m_curg(BX), AX
	MOVQ	AX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(AX), SP
	MOVQ	$0, (g_sched+gobuf_sp)(AX)
	RET

noswitch:
	// already on m stack, just call directly
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI
	RET

/*
 * support for morestack
 */

// Called during function prolog when more stack is needed.
//
// The traceback routines see morestack on a g0 as being
// the top of a stack (for example, morestack calling newstack
// calling the scheduler calling newm calling gc), so we must
// record an argument size. For that purpose, it has no arguments.
TEXT runtime·morestack(SB),NOSPLIT,$0-0
	// Cannot grow scheduler stack (m->g0).
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	g(CX), SI
	JNE	3(PC)
	CALL	runtime·badmorestackg0(SB)
	INT	$3

	// Cannot grow signal stack (m->gsignal).
	MOVQ	m_gsignal(BX), SI
	CMPQ	g(CX), SI
	JNE	3(PC)
	CALL	runtime·badmorestackgsignal(SB)
	INT	$3

	// Called from f.
	// Set m->morebuf to f's caller.
	MOVQ	8(SP), AX	// f's caller's PC
	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
	LEAQ	16(SP), AX	// f's caller's SP
	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SI, (m_morebuf+gobuf_g)(BX)

	// Set g->sched to context in f.
	MOVQ	0(SP), AX // f's PC
	MOVQ	AX, (g_sched+gobuf_pc)(SI)
	MOVQ	SI, (g_sched+gobuf_g)(SI)
	LEAQ	8(SP), AX // f's SP
	MOVQ	AX, (g_sched+gobuf_sp)(SI)
	MOVQ	BP, (g_sched+gobuf_bp)(SI)
	// newstack will fill gobuf.ctxt.

	// Call newstack on m->g0's stack.
	MOVQ	m_g0(BX), BX
	MOVQ	BX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(BX), SP
	PUSHQ	DX	// ctxt argument
	CALL	runtime·newstack(SB)
	MOVQ	$0, 0x1003	// crash if newstack returns
	POPQ	DX	// keep balance check happy
	RET

// morestack but not preserving ctxt.
TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
	MOVL	$0, DX
	JMP	runtime·morestack(SB)

// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
// of constant-sized-frame functions to encode a few bits of size in the pc.
// Caution: ugly multiline assembly macros in your future!

#define DISPATCH(NAME,MAXSIZE)		\
	CMPQ	CX, $MAXSIZE;		\
	JA	3(PC);			\
	MOVQ	$NAME(SB), AX;		\
	JMP	AX
// Note: can't just "JMP NAME(SB)" - bad inlining results.
TEXT reflect·call(SB), NOSPLIT, $0-0
	JMP	·reflectcall(SB)

TEXT ·reflectcall(SB), NOSPLIT, $0-32
	MOVLQZX argsize+24(FP), CX
	DISPATCH(runtime·call32, 32)
	DISPATCH(runtime·call64, 64)
	DISPATCH(runtime·call128, 128)
	DISPATCH(runtime·call256, 256)
	DISPATCH(runtime·call512, 512)
	DISPATCH(runtime·call1024, 1024)
	DISPATCH(runtime·call2048, 2048)
	DISPATCH(runtime·call4096, 4096)
	DISPATCH(runtime·call8192, 8192)
	DISPATCH(runtime·call16384, 16384)
	DISPATCH(runtime·call32768, 32768)
	DISPATCH(runtime·call65536, 65536)
	DISPATCH(runtime·call131072, 131072)
	DISPATCH(runtime·call262144, 262144)
	DISPATCH(runtime·call524288, 524288)
	DISPATCH(runtime·call1048576, 1048576)
	DISPATCH(runtime·call2097152, 2097152)
	DISPATCH(runtime·call4194304, 4194304)
	DISPATCH(runtime·call8388608, 8388608)
	DISPATCH(runtime·call16777216, 16777216)
	DISPATCH(runtime·call33554432, 33554432)
	DISPATCH(runtime·call67108864, 67108864)
	DISPATCH(runtime·call134217728, 134217728)
	DISPATCH(runtime·call268435456, 268435456)
	DISPATCH(runtime·call536870912, 536870912)
	DISPATCH(runtime·call1073741824, 1073741824)
	MOVQ	$runtime·badreflectcall(SB), AX
	JMP	AX

#define CALLFN(NAME,MAXSIZE)			\
TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
	NO_LOCAL_POINTERS;			\
	/* copy arguments to stack */		\
	MOVQ	argptr+16(FP), SI;		\
	MOVLQZX argsize+24(FP), CX;		\
	MOVQ	SP, DI;				\
	REP;MOVSB;				\
	/* call function */			\
	MOVQ	f+8(FP), DX;			\
	PCDATA  $PCDATA_StackMapIndex, $0;	\
	CALL	(DX);				\
	/* copy return values back */		\
	MOVQ	argtype+0(FP), DX;		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX	argsize+24(FP), CX;		\
	MOVLQZX retoffset+28(FP), BX;		\
	MOVQ	SP, SI;				\
	ADDQ	BX, DI;				\
	ADDQ	BX, SI;				\
	SUBQ	BX, CX;				\
	CALL	callRet<>(SB);			\
	RET

// callRet copies return values back at the end of call*. This is a
// separate function so it can allocate stack space for the arguments
// to reflectcallmove. It does not follow the Go ABI; it expects its
// arguments in registers.
TEXT callRet<>(SB), NOSPLIT, $32-0
	NO_LOCAL_POINTERS
	MOVQ	DX, 0(SP)
	MOVQ	DI, 8(SP)
	MOVQ	SI, 16(SP)
	MOVQ	CX, 24(SP)
	CALL	runtime·reflectcallmove(SB)
	RET

CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
CALLFN(·call256, 256)
CALLFN(·call512, 512)
CALLFN(·call1024, 1024)
CALLFN(·call2048, 2048)
CALLFN(·call4096, 4096)
CALLFN(·call8192, 8192)
CALLFN(·call16384, 16384)
CALLFN(·call32768, 32768)
CALLFN(·call65536, 65536)
CALLFN(·call131072, 131072)
CALLFN(·call262144, 262144)
CALLFN(·call524288, 524288)
CALLFN(·call1048576, 1048576)
CALLFN(·call2097152, 2097152)
CALLFN(·call4194304, 4194304)
CALLFN(·call8388608, 8388608)
CALLFN(·call16777216, 16777216)
CALLFN(·call33554432, 33554432)
CALLFN(·call67108864, 67108864)
CALLFN(·call134217728, 134217728)
CALLFN(·call268435456, 268435456)
CALLFN(·call536870912, 536870912)
CALLFN(·call1073741824, 1073741824)

TEXT runtime·procyield(SB),NOSPLIT,$0-0
	MOVL	cycles+0(FP), AX
again:
	PAUSE
	SUBL	$1, AX
	JNZ	again
	RET


TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
	// Stores are already ordered on x86, so this is just a
	// compile barrier.
	RET

// void jmpdefer(fn, sp);
// called from deferreturn.
// 1. pop the caller
// 2. sub 5 bytes from the callers return
// 3.
jmp to the argument 555 TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16 556 MOVQ fv+0(FP), DX // fn 557 MOVQ argp+8(FP), BX // caller sp 558 LEAQ -8(BX), SP // caller sp after CALL 559 MOVQ -8(SP), BP // restore BP as if deferreturn returned (harmless if framepointers not in use) 560 SUBQ $5, (SP) // return to CALL again 561 MOVQ 0(DX), BX 562 JMP BX // but first run the deferred function 563 564 // Save state of caller into g->sched. Smashes R8, R9. 565 TEXT gosave<>(SB),NOSPLIT,$0 566 get_tls(R8) 567 MOVQ g(R8), R8 568 MOVQ 0(SP), R9 569 MOVQ R9, (g_sched+gobuf_pc)(R8) 570 LEAQ 8(SP), R9 571 MOVQ R9, (g_sched+gobuf_sp)(R8) 572 MOVQ $0, (g_sched+gobuf_ret)(R8) 573 MOVQ BP, (g_sched+gobuf_bp)(R8) 574 // Assert ctxt is zero. See func save. 575 MOVQ (g_sched+gobuf_ctxt)(R8), R9 576 TESTQ R9, R9 577 JZ 2(PC) 578 CALL runtime·badctxt(SB) 579 RET 580 581 // func asmcgocall(fn, arg unsafe.Pointer) int32 582 // Call fn(arg) on the scheduler stack, 583 // aligned appropriately for the gcc ABI. 584 // See cgocall.go for more details. 585 TEXT ·asmcgocall(SB),NOSPLIT,$0-20 586 MOVQ fn+0(FP), AX 587 MOVQ arg+8(FP), BX 588 589 MOVQ SP, DX 590 591 // Figure out if we need to switch to m->g0 stack. 592 // We get called to create new OS threads too, and those 593 // come in on the m->g0 stack already. 594 get_tls(CX) 595 MOVQ g(CX), R8 596 CMPQ R8, $0 597 JEQ nosave 598 MOVQ g_m(R8), R8 599 MOVQ m_g0(R8), SI 600 MOVQ g(CX), DI 601 CMPQ SI, DI 602 JEQ nosave 603 MOVQ m_gsignal(R8), SI 604 CMPQ SI, DI 605 JEQ nosave 606 607 // Switch to system stack. 608 MOVQ m_g0(R8), SI 609 CALL gosave<>(SB) 610 MOVQ SI, g(CX) 611 MOVQ (g_sched+gobuf_sp)(SI), SP 612 613 // Now on a scheduling stack (a pthread-created stack). 614 // Make sure we have enough room for 4 stack-backed fast-call 615 // registers as per windows amd64 calling convention. 
616 SUBQ $64, SP 617 ANDQ $~15, SP // alignment for gcc ABI 618 MOVQ DI, 48(SP) // save g 619 MOVQ (g_stack+stack_hi)(DI), DI 620 SUBQ DX, DI 621 MOVQ DI, 40(SP) // save depth in stack (can't just save SP, as stack might be copied during a callback) 622 MOVQ BX, DI // DI = first argument in AMD64 ABI 623 MOVQ BX, CX // CX = first argument in Win64 624 CALL AX 625 626 // Restore registers, g, stack pointer. 627 get_tls(CX) 628 MOVQ 48(SP), DI 629 MOVQ (g_stack+stack_hi)(DI), SI 630 SUBQ 40(SP), SI 631 MOVQ DI, g(CX) 632 MOVQ SI, SP 633 634 MOVL AX, ret+16(FP) 635 RET 636 637 nosave: 638 // Running on a system stack, perhaps even without a g. 639 // Having no g can happen during thread creation or thread teardown 640 // (see needm/dropm on Solaris, for example). 641 // This code is like the above sequence but without saving/restoring g 642 // and without worrying about the stack moving out from under us 643 // (because we're on a system stack, not a goroutine stack). 644 // The above code could be used directly if already on a system stack, 645 // but then the only path through this code would be a rare case on Solaris. 646 // Using this code for all "already on system stack" calls exercises it more, 647 // which should help keep it correct. 648 SUBQ $64, SP 649 ANDQ $~15, SP 650 MOVQ $0, 48(SP) // where above code stores g, in case someone looks during debugging 651 MOVQ DX, 40(SP) // save original stack pointer 652 MOVQ BX, DI // DI = first argument in AMD64 ABI 653 MOVQ BX, CX // CX = first argument in Win64 654 CALL AX 655 MOVQ 40(SP), SI // restore original stack pointer 656 MOVQ SI, SP 657 MOVL AX, ret+16(FP) 658 RET 659 660 // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt) 661 // Turn the fn into a Go func (by taking its address) and call 662 // cgocallback_gofunc. 
TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
	LEAQ	fn+0(FP), AX
	MOVQ	AX, 0(SP)
	MOVQ	frame+8(FP), AX
	MOVQ	AX, 8(SP)
	MOVQ	framesize+16(FP), AX
	MOVQ	AX, 16(SP)
	MOVQ	ctxt+24(FP), AX
	MOVQ	AX, 24(SP)
	MOVQ	$runtime·cgocallback_gofunc(SB), AX
	CALL	AX
	RET

// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
// See cgocall.go for more details.
TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
	NO_LOCAL_POINTERS

	// If g is nil, Go did not create the current thread.
	// Call needm to obtain one m for temporary use.
	// In this case, we're running on the thread stack, so there's
	// lots of space, but the linker doesn't know. Hide the call from
	// the linker analysis by using an indirect call through AX.
	get_tls(CX)
#ifdef GOOS_windows
	MOVL	$0, BX
	CMPQ	CX, $0
	JEQ	2(PC)
#endif
	MOVQ	g(CX), BX
	CMPQ	BX, $0
	JEQ	needm
	MOVQ	g_m(BX), BX
	MOVQ	BX, R8 // holds oldm until end of function
	JMP	havem
needm:
	MOVQ	$0, 0(SP)
	MOVQ	$runtime·needm(SB), AX
	CALL	AX
	MOVQ	0(SP), R8
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX

	// Set m->sched.sp = SP, so that if a panic happens
	// during the function we are about to execute, it will
	// have a valid SP to run on the g0 stack.
	// The next few lines (after the havem label)
	// will save this SP onto the stack and then write
	// the same SP back to m->sched.sp. That seems redundant,
	// but if an unrecovered panic happens, unwindm will
	// restore the g->sched.sp from the stack location
	// and then systemstack will try to use it. If we don't set it here,
	// that restored SP will be uninitialized (typically 0) and
	// will not be usable.
	MOVQ	m_g0(BX), SI
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

havem:
	// Now there's a valid m, and we're running on its m->g0.
	// Save current m->g0->sched.sp on stack and then set it to SP.
	// Save current sp in m->g0->sched.sp in preparation for
	// switch back to m->curg stack.
	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
	MOVQ	m_g0(BX), SI
	MOVQ	(g_sched+gobuf_sp)(SI), AX
	MOVQ	AX, 0(SP)
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

	// Switch to m->curg stack and call runtime.cgocallbackg.
	// Because we are taking over the execution of m->curg
	// but *not* resuming what had been running, we need to
	// save that information (m->curg->sched) so we can restore it.
	// We can restore m->curg->sched.sp easily, because calling
	// runtime.cgocallbackg leaves SP unchanged upon return.
	// To save m->curg->sched.pc, we push it onto the stack.
	// This has the added benefit that it looks to the traceback
	// routine like cgocallbackg is going to return to that
	// PC (because the frame we allocate below has the same
	// size as cgocallback_gofunc's frame declared above)
	// so that the traceback will seamlessly trace back into
	// the earlier calls.
	//
	// In the new goroutine, 8(SP) holds the saved R8.
	MOVQ	m_curg(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
	MOVQ	(g_sched+gobuf_pc)(SI), BX
	MOVQ	BX, -8(DI)
	// Compute the size of the frame, including return PC and, if
	// GOEXPERIMENT=framepointer, the saved base pointer
	MOVQ	ctxt+24(FP), BX
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX
	SUBQ	AX, DI
	MOVQ	DI, SP

	MOVQ	R8, 8(SP)
	MOVQ	BX, 0(SP)
	CALL	runtime·cgocallbackg(SB)
	MOVQ	8(SP), R8

	// Compute the size of the frame again. FP and SP have
	// completely different values here than they did above,
	// but only their difference matters.
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX

	// Restore g->sched (== m->curg->sched) from saved values.
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SP, DI
	ADDQ	AX, DI
	MOVQ	-8(DI), BX
	MOVQ	BX, (g_sched+gobuf_pc)(SI)
	MOVQ	DI, (g_sched+gobuf_sp)(SI)

	// Switch back to m->g0's stack and restore m->g0->sched.sp.
	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
	// so we do not have to restore it.)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP
	MOVQ	0(SP), AX
	MOVQ	AX, (g_sched+gobuf_sp)(SI)

	// If the m on entry was nil, we called needm above to borrow an m
	// for the duration of the call. Since the call is over, return it with dropm.
	CMPQ	R8, $0
	JNE 3(PC)
	MOVQ	$runtime·dropm(SB), AX
	CALL	AX

	// Done!
	RET

// void setg(G*); set g. for use by needm.
TEXT runtime·setg(SB), NOSPLIT, $0-8
	MOVQ	gg+0(FP), BX
#ifdef GOOS_windows
	CMPQ	BX, $0
	JNE	settls
	MOVQ	$0, 0x28(GS)
	RET
settls:
	MOVQ	g_m(BX), AX
	LEAQ	m_tls(AX), AX
	MOVQ	AX, 0x28(GS)
#endif
	get_tls(CX)
	MOVQ	BX, g(CX)
	RET

// void setg_gcc(G*); set g called from gcc.
TEXT setg_gcc<>(SB),NOSPLIT,$0
	get_tls(AX)
	MOVQ	DI, g(AX)
	RET

// check that SP is in range [g->stack.lo, g->stack.hi)
TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
	get_tls(CX)
	MOVQ	g(CX), AX
	CMPQ	(g_stack+stack_hi)(AX), SP
	JHI	2(PC)
	INT	$3
	CMPQ	SP, (g_stack+stack_lo)(AX)
	JHI	2(PC)
	INT	$3
	RET

TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
	MOVQ	argp+0(FP),AX		// addr of first arg
	MOVQ	-8(AX),AX		// get calling pc
	MOVQ	AX, ret+8(FP)
	RET

// func cputicks() int64
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
	JNE	mfence
	LFENCE
	JMP	done
mfence:
	MFENCE
done:
	RDTSC
	SHLQ	$32, DX
	ADDQ	DX, AX
	MOVQ	AX, ret+0(FP)
	RET

// hash function using AES hardware instructions
TEXT runtime·aeshash(SB),NOSPLIT,$0-32
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	s+16(FP), CX	// size
	LEAQ	ret+24(FP), DX
	JMP	runtime·aeshashbody(SB)

TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to string struct
	MOVQ	8(AX), CX	// length of string
	MOVQ	(AX), AX	// string data
	LEAQ	ret+16(FP), DX
	JMP	runtime·aeshashbody(SB)

// AX: data
// CX: length
// DX: address to put return value
TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
	// Fill an SSE register with our seeds.
	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
	PINSRW	$4, CX, X0			// 16 bits of length
	PSHUFHW $0, X0, X0			// repeat length 4 times total
	MOVO	X0, X1				// save unscrambled seed
	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
	AESENC	X0, X0				// scramble seed

	CMPQ	CX, $16
	JB	aes0to15
	JE	aes16
	CMPQ	CX, $32
	JBE	aes17to32
	CMPQ	CX, $64
	JBE	aes33to64
	CMPQ	CX, $128
	JBE	aes65to128
	JMP	aes129plus

aes0to15:
	TESTQ	CX, CX
	JE	aes0

	ADDQ	$16, AX
	TESTW	$0xff0, AX
	JE	endofpage

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X1
	ADDQ	CX, CX
	MOVQ	$masks<>(SB), AX
	PAND	(AX)(CX*8), X1
final1:
	PXOR	X0, X1	// xor data with seed
	AESENC	X1, X1	// scramble combo 3 times
	AESENC	X1, X1
	AESENC	X1, X1
	MOVQ	X1, (DX)
	RET

endofpage:
	// address ends in 1111xxxx. Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	MOVOU	-32(AX)(CX*1), X1
	ADDQ	CX, CX
	MOVQ	$shifts<>(SB), AX
	PSHUFB	(AX)(CX*8), X1
	JMP	final1

aes0:
	// Return scrambled input seed
	AESENC	X0, X0
	MOVQ	X0, (DX)
	RET

aes16:
	MOVOU	(AX), X1
	JMP	final1

aes17to32:
	// make second starting seed
	PXOR	runtime·aeskeysched+16(SB), X1
	AESENC	X1, X1

	// load data to be hashed
	MOVOU	(AX), X2
	MOVOU	-16(AX)(CX*1), X3

	// xor with seed
	PXOR	X0, X2
	PXOR	X1, X3

	// scramble 3 times
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3

	// combine results
	PXOR	X3, X2
	MOVQ	X2, (DX)
	RET

aes33to64:
	// make 3 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3

	MOVOU	(AX), X4
	MOVOU	16(AX), X5
	MOVOU	-32(AX)(CX*1), X6
	MOVOU	-16(AX)(CX*1), X7

	PXOR	X0, X4
	PXOR	X1, X5
	PXOR	X2, X6
	PXOR	X3, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	PXOR	X6, X4
	PXOR	X7, X5
	PXOR	X5, X4
	MOVQ	X4, (DX)
	RET

aes65to128:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// load data
	MOVOU	(AX), X8
	MOVOU	16(AX), X9
	MOVOU	32(AX), X10
	MOVOU	48(AX), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor with seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// scramble 3 times
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

aes129plus:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// start with last (possibly overlapping) block
	MOVOU	-128(AX)(CX*1), X8
	MOVOU	-112(AX)(CX*1), X9
	MOVOU	-96(AX)(CX*1), X10
	MOVOU	-80(AX)(CX*1), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor in seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// compute number of remaining 128-byte blocks
	DECQ	CX
	SHRQ	$7, CX

aesloop:
	// scramble state
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// scramble state, xor in a block
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	MOVOU	64(AX), X4
	MOVOU	80(AX), X5
	MOVOU	96(AX), X6
	MOVOU	112(AX), X7
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	ADDQ	$128, AX
	DECQ	CX
	JNE	aesloop

	// 3 more scrambles to finish
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

// simple mask to get rid of data in the high part of the register.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256

TEXT ·checkASM(SB),NOSPLIT,$0-1
	//
check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte 1259 MOVQ $masks<>(SB), AX 1260 MOVQ $shifts<>(SB), BX 1261 ORQ BX, AX 1262 TESTQ $15, AX 1263 SETEQ ret+0(FP) 1264 RET 1265 1266 // these are arguments to pshufb. They move data down from 1267 // the high bytes of the register to the low bytes of the register. 1268 // index is how many bytes to move. 1269 DATA shifts<>+0x00(SB)/8, $0x0000000000000000 1270 DATA shifts<>+0x08(SB)/8, $0x0000000000000000 1271 DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f 1272 DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff 1273 DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e 1274 DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff 1275 DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d 1276 DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff 1277 DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c 1278 DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff 1279 DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b 1280 DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff 1281 DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a 1282 DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff 1283 DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09 1284 DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff 1285 DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908 1286 DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff 1287 DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807 1288 DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f 1289 DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706 1290 DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e 1291 DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605 1292 DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d 1293 DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504 1294 DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c 1295 DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403 1296 DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b 1297 DATA shifts<>+0xe0(SB)/8, $0x0908070605040302 1298 DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a 1299 DATA shifts<>+0xf0(SB)/8, $0x0807060504030201 1300 DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09 
1301 GLOBL shifts<>(SB),RODATA,$256 1302 1303 // memequal(p, q unsafe.Pointer, size uintptr) bool 1304 TEXT runtime·memequal(SB),NOSPLIT,$0-25 1305 MOVQ a+0(FP), SI 1306 MOVQ b+8(FP), DI 1307 CMPQ SI, DI 1308 JEQ eq 1309 MOVQ size+16(FP), BX 1310 LEAQ ret+24(FP), AX 1311 JMP runtime·memeqbody(SB) 1312 eq: 1313 MOVB $1, ret+24(FP) 1314 RET 1315 1316 // memequal_varlen(a, b unsafe.Pointer) bool 1317 TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17 1318 MOVQ a+0(FP), SI 1319 MOVQ b+8(FP), DI 1320 CMPQ SI, DI 1321 JEQ eq 1322 MOVQ 8(DX), BX // compiler stores size at offset 8 in the closure 1323 LEAQ ret+16(FP), AX 1324 JMP runtime·memeqbody(SB) 1325 eq: 1326 MOVB $1, ret+16(FP) 1327 RET 1328 1329 // eqstring tests whether two strings are equal. 1330 // The compiler guarantees that strings passed 1331 // to eqstring have equal length. 1332 // See runtime_test.go:eqstring_generic for 1333 // equivalent Go code. 1334 TEXT runtime·eqstring(SB),NOSPLIT,$0-33 1335 MOVQ s1_base+0(FP), SI 1336 MOVQ s2_base+16(FP), DI 1337 CMPQ SI, DI 1338 JEQ eq 1339 MOVQ s1_len+8(FP), BX 1340 LEAQ ret+32(FP), AX 1341 JMP runtime·memeqbody(SB) 1342 eq: 1343 MOVB $1, ret+32(FP) 1344 RET 1345 1346 // a in SI 1347 // b in DI 1348 // count in BX 1349 // address of result byte in AX 1350 TEXT runtime·memeqbody(SB),NOSPLIT,$0-0 1351 CMPQ BX, $8 1352 JB small 1353 CMPQ BX, $64 1354 JB bigloop 1355 CMPB runtime·support_avx2(SB), $1 1356 JE hugeloop_avx2 1357 1358 // 64 bytes at a time using xmm registers 1359 hugeloop: 1360 CMPQ BX, $64 1361 JB bigloop 1362 MOVOU (SI), X0 1363 MOVOU (DI), X1 1364 MOVOU 16(SI), X2 1365 MOVOU 16(DI), X3 1366 MOVOU 32(SI), X4 1367 MOVOU 32(DI), X5 1368 MOVOU 48(SI), X6 1369 MOVOU 48(DI), X7 1370 PCMPEQB X1, X0 1371 PCMPEQB X3, X2 1372 PCMPEQB X5, X4 1373 PCMPEQB X7, X6 1374 PAND X2, X0 1375 PAND X6, X4 1376 PAND X4, X0 1377 PMOVMSKB X0, DX 1378 ADDQ $64, SI 1379 ADDQ $64, DI 1380 SUBQ $64, BX 1381 CMPL DX, $0xffff 1382 JEQ hugeloop 1383 MOVB $0, (AX) 1384 RET 1385 1386 // 
64 bytes at a time using ymm registers 1387 hugeloop_avx2: 1388 CMPQ BX, $64 1389 JB bigloop_avx2 1390 VMOVDQU (SI), Y0 1391 VMOVDQU (DI), Y1 1392 VMOVDQU 32(SI), Y2 1393 VMOVDQU 32(DI), Y3 1394 VPCMPEQB Y1, Y0, Y4 1395 VPCMPEQB Y2, Y3, Y5 1396 VPAND Y4, Y5, Y6 1397 VPMOVMSKB Y6, DX 1398 ADDQ $64, SI 1399 ADDQ $64, DI 1400 SUBQ $64, BX 1401 CMPL DX, $0xffffffff 1402 JEQ hugeloop_avx2 1403 VZEROUPPER 1404 MOVB $0, (AX) 1405 RET 1406 1407 bigloop_avx2: 1408 VZEROUPPER 1409 1410 // 8 bytes at a time using 64-bit register 1411 bigloop: 1412 CMPQ BX, $8 1413 JBE leftover 1414 MOVQ (SI), CX 1415 MOVQ (DI), DX 1416 ADDQ $8, SI 1417 ADDQ $8, DI 1418 SUBQ $8, BX 1419 CMPQ CX, DX 1420 JEQ bigloop 1421 MOVB $0, (AX) 1422 RET 1423 1424 // remaining 0-8 bytes 1425 leftover: 1426 MOVQ -8(SI)(BX*1), CX 1427 MOVQ -8(DI)(BX*1), DX 1428 CMPQ CX, DX 1429 SETEQ (AX) 1430 RET 1431 1432 small: 1433 CMPQ BX, $0 1434 JEQ equal 1435 1436 LEAQ 0(BX*8), CX 1437 NEGQ CX 1438 1439 CMPB SI, $0xf8 1440 JA si_high 1441 1442 // load at SI won't cross a page boundary. 1443 MOVQ (SI), SI 1444 JMP si_finish 1445 si_high: 1446 // address ends in 11111xxx. Load up to bytes we want, move to correct position. 1447 MOVQ -8(SI)(BX*1), SI 1448 SHRQ CX, SI 1449 si_finish: 1450 1451 // same for DI. 
1452 CMPB DI, $0xf8 1453 JA di_high 1454 MOVQ (DI), DI 1455 JMP di_finish 1456 di_high: 1457 MOVQ -8(DI)(BX*1), DI 1458 SHRQ CX, DI 1459 di_finish: 1460 1461 SUBQ SI, DI 1462 SHLQ CX, DI 1463 equal: 1464 SETEQ (AX) 1465 RET 1466 1467 TEXT runtime·cmpstring(SB),NOSPLIT,$0-40 1468 MOVQ s1_base+0(FP), SI 1469 MOVQ s1_len+8(FP), BX 1470 MOVQ s2_base+16(FP), DI 1471 MOVQ s2_len+24(FP), DX 1472 LEAQ ret+32(FP), R9 1473 JMP runtime·cmpbody(SB) 1474 1475 TEXT bytes·Compare(SB),NOSPLIT,$0-56 1476 MOVQ s1+0(FP), SI 1477 MOVQ s1+8(FP), BX 1478 MOVQ s2+24(FP), DI 1479 MOVQ s2+32(FP), DX 1480 LEAQ res+48(FP), R9 1481 JMP runtime·cmpbody(SB) 1482 1483 // input: 1484 // SI = a 1485 // DI = b 1486 // BX = alen 1487 // DX = blen 1488 // R9 = address of output word (stores -1/0/1 here) 1489 TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 1490 CMPQ SI, DI 1491 JEQ allsame 1492 CMPQ BX, DX 1493 MOVQ DX, R8 1494 CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare 1495 CMPQ R8, $8 1496 JB small 1497 1498 CMPQ R8, $63 1499 JBE loop 1500 CMPB runtime·support_avx2(SB), $1 1501 JEQ big_loop_avx2 1502 JMP big_loop 1503 loop: 1504 CMPQ R8, $16 1505 JBE _0through16 1506 MOVOU (SI), X0 1507 MOVOU (DI), X1 1508 PCMPEQB X0, X1 1509 PMOVMSKB X1, AX 1510 XORQ $0xffff, AX // convert EQ to NE 1511 JNE diff16 // branch if at least one byte is not equal 1512 ADDQ $16, SI 1513 ADDQ $16, DI 1514 SUBQ $16, R8 1515 JMP loop 1516 1517 diff64: 1518 ADDQ $48, SI 1519 ADDQ $48, DI 1520 JMP diff16 1521 diff48: 1522 ADDQ $32, SI 1523 ADDQ $32, DI 1524 JMP diff16 1525 diff32: 1526 ADDQ $16, SI 1527 ADDQ $16, DI 1528 // AX = bit mask of differences 1529 diff16: 1530 BSFQ AX, BX // index of first byte that differs 1531 XORQ AX, AX 1532 MOVB (SI)(BX*1), CX 1533 CMPB CX, (DI)(BX*1) 1534 SETHI AX 1535 LEAQ -1(AX*2), AX // convert 1/0 to +1/-1 1536 MOVQ AX, (R9) 1537 RET 1538 1539 // 0 through 16 bytes left, alen>=8, blen>=8 1540 _0through16: 1541 CMPQ R8, $8 1542 JBE _0through8 1543 MOVQ (SI), AX 1544 MOVQ (DI), CX 
1545 CMPQ AX, CX 1546 JNE diff8 1547 _0through8: 1548 MOVQ -8(SI)(R8*1), AX 1549 MOVQ -8(DI)(R8*1), CX 1550 CMPQ AX, CX 1551 JEQ allsame 1552 1553 // AX and CX contain parts of a and b that differ. 1554 diff8: 1555 BSWAPQ AX // reverse order of bytes 1556 BSWAPQ CX 1557 XORQ AX, CX 1558 BSRQ CX, CX // index of highest bit difference 1559 SHRQ CX, AX // move a's bit to bottom 1560 ANDQ $1, AX // mask bit 1561 LEAQ -1(AX*2), AX // 1/0 => +1/-1 1562 MOVQ AX, (R9) 1563 RET 1564 1565 // 0-7 bytes in common 1566 small: 1567 LEAQ (R8*8), CX // bytes left -> bits left 1568 NEGQ CX // - bits lift (== 64 - bits left mod 64) 1569 JEQ allsame 1570 1571 // load bytes of a into high bytes of AX 1572 CMPB SI, $0xf8 1573 JA si_high 1574 MOVQ (SI), SI 1575 JMP si_finish 1576 si_high: 1577 MOVQ -8(SI)(R8*1), SI 1578 SHRQ CX, SI 1579 si_finish: 1580 SHLQ CX, SI 1581 1582 // load bytes of b in to high bytes of BX 1583 CMPB DI, $0xf8 1584 JA di_high 1585 MOVQ (DI), DI 1586 JMP di_finish 1587 di_high: 1588 MOVQ -8(DI)(R8*1), DI 1589 SHRQ CX, DI 1590 di_finish: 1591 SHLQ CX, DI 1592 1593 BSWAPQ SI // reverse order of bytes 1594 BSWAPQ DI 1595 XORQ SI, DI // find bit differences 1596 JEQ allsame 1597 BSRQ DI, CX // index of highest bit difference 1598 SHRQ CX, SI // move a's bit to bottom 1599 ANDQ $1, SI // mask bit 1600 LEAQ -1(SI*2), AX // 1/0 => +1/-1 1601 MOVQ AX, (R9) 1602 RET 1603 1604 allsame: 1605 XORQ AX, AX 1606 XORQ CX, CX 1607 CMPQ BX, DX 1608 SETGT AX // 1 if alen > blen 1609 SETEQ CX // 1 if alen == blen 1610 LEAQ -1(CX)(AX*2), AX // 1,0,-1 result 1611 MOVQ AX, (R9) 1612 RET 1613 1614 // this works for >= 64 bytes of data. 
1615 big_loop: 1616 MOVOU (SI), X0 1617 MOVOU (DI), X1 1618 PCMPEQB X0, X1 1619 PMOVMSKB X1, AX 1620 XORQ $0xffff, AX 1621 JNE diff16 1622 1623 MOVOU 16(SI), X0 1624 MOVOU 16(DI), X1 1625 PCMPEQB X0, X1 1626 PMOVMSKB X1, AX 1627 XORQ $0xffff, AX 1628 JNE diff32 1629 1630 MOVOU 32(SI), X0 1631 MOVOU 32(DI), X1 1632 PCMPEQB X0, X1 1633 PMOVMSKB X1, AX 1634 XORQ $0xffff, AX 1635 JNE diff48 1636 1637 MOVOU 48(SI), X0 1638 MOVOU 48(DI), X1 1639 PCMPEQB X0, X1 1640 PMOVMSKB X1, AX 1641 XORQ $0xffff, AX 1642 JNE diff64 1643 1644 ADDQ $64, SI 1645 ADDQ $64, DI 1646 SUBQ $64, R8 1647 CMPQ R8, $64 1648 JBE loop 1649 JMP big_loop 1650 1651 // Compare 64-bytes per loop iteration. 1652 // Loop is unrolled and uses AVX2. 1653 big_loop_avx2: 1654 VMOVDQU (SI), Y2 1655 VMOVDQU (DI), Y3 1656 VMOVDQU 32(SI), Y4 1657 VMOVDQU 32(DI), Y5 1658 VPCMPEQB Y2, Y3, Y0 1659 VPMOVMSKB Y0, AX 1660 XORL $0xffffffff, AX 1661 JNE diff32_avx2 1662 VPCMPEQB Y4, Y5, Y6 1663 VPMOVMSKB Y6, AX 1664 XORL $0xffffffff, AX 1665 JNE diff64_avx2 1666 1667 ADDQ $64, SI 1668 ADDQ $64, DI 1669 SUBQ $64, R8 1670 CMPQ R8, $64 1671 JB big_loop_avx2_exit 1672 JMP big_loop_avx2 1673 1674 // Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk. 1675 diff32_avx2: 1676 VZEROUPPER 1677 JMP diff16 1678 1679 // Same as diff32_avx2, but for last 32 bytes. 1680 diff64_avx2: 1681 VZEROUPPER 1682 JMP diff48 1683 1684 // For <64 bytes remainder jump to normal loop. 
1685 big_loop_avx2_exit: 1686 VZEROUPPER 1687 JMP loop 1688 1689 TEXT strings·indexShortStr(SB),NOSPLIT,$0-40 1690 MOVQ s+0(FP), DI 1691 // We want len in DX and AX, because PCMPESTRI implicitly consumes them 1692 MOVQ s_len+8(FP), DX 1693 MOVQ c+16(FP), BP 1694 MOVQ c_len+24(FP), AX 1695 MOVQ DI, R10 1696 LEAQ ret+32(FP), R11 1697 JMP runtime·indexShortStr(SB) 1698 1699 TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56 1700 MOVQ s+0(FP), DI 1701 MOVQ s_len+8(FP), DX 1702 MOVQ c+24(FP), BP 1703 MOVQ c_len+32(FP), AX 1704 MOVQ DI, R10 1705 LEAQ ret+48(FP), R11 1706 JMP runtime·indexShortStr(SB) 1707 1708 // AX: length of string, that we are searching for 1709 // DX: length of string, in which we are searching 1710 // DI: pointer to string, in which we are searching 1711 // BP: pointer to string, that we are searching for 1712 // R11: address, where to put return value 1713 TEXT runtime·indexShortStr(SB),NOSPLIT,$0 1714 CMPQ AX, DX 1715 JA fail 1716 CMPQ DX, $16 1717 JAE sse42 1718 no_sse42: 1719 CMPQ AX, $2 1720 JA _3_or_more 1721 MOVW (BP), BP 1722 LEAQ -1(DI)(DX*1), DX 1723 loop2: 1724 MOVW (DI), SI 1725 CMPW SI,BP 1726 JZ success 1727 ADDQ $1,DI 1728 CMPQ DI,DX 1729 JB loop2 1730 JMP fail 1731 _3_or_more: 1732 CMPQ AX, $3 1733 JA _4_or_more 1734 MOVW 1(BP), BX 1735 MOVW (BP), BP 1736 LEAQ -2(DI)(DX*1), DX 1737 loop3: 1738 MOVW (DI), SI 1739 CMPW SI,BP 1740 JZ partial_success3 1741 ADDQ $1,DI 1742 CMPQ DI,DX 1743 JB loop3 1744 JMP fail 1745 partial_success3: 1746 MOVW 1(DI), SI 1747 CMPW SI,BX 1748 JZ success 1749 ADDQ $1,DI 1750 CMPQ DI,DX 1751 JB loop3 1752 JMP fail 1753 _4_or_more: 1754 CMPQ AX, $4 1755 JA _5_or_more 1756 MOVL (BP), BP 1757 LEAQ -3(DI)(DX*1), DX 1758 loop4: 1759 MOVL (DI), SI 1760 CMPL SI,BP 1761 JZ success 1762 ADDQ $1,DI 1763 CMPQ DI,DX 1764 JB loop4 1765 JMP fail 1766 _5_or_more: 1767 CMPQ AX, $7 1768 JA _8_or_more 1769 LEAQ 1(DI)(DX*1), DX 1770 SUBQ AX, DX 1771 MOVL -4(BP)(AX*1), BX 1772 MOVL (BP), BP 1773 loop5to7: 1774 MOVL (DI), SI 1775 CMPL 
SI,BP 1776 JZ partial_success5to7 1777 ADDQ $1,DI 1778 CMPQ DI,DX 1779 JB loop5to7 1780 JMP fail 1781 partial_success5to7: 1782 MOVL -4(AX)(DI*1), SI 1783 CMPL SI,BX 1784 JZ success 1785 ADDQ $1,DI 1786 CMPQ DI,DX 1787 JB loop5to7 1788 JMP fail 1789 _8_or_more: 1790 CMPQ AX, $8 1791 JA _9_or_more 1792 MOVQ (BP), BP 1793 LEAQ -7(DI)(DX*1), DX 1794 loop8: 1795 MOVQ (DI), SI 1796 CMPQ SI,BP 1797 JZ success 1798 ADDQ $1,DI 1799 CMPQ DI,DX 1800 JB loop8 1801 JMP fail 1802 _9_or_more: 1803 CMPQ AX, $15 1804 JA _16_or_more 1805 LEAQ 1(DI)(DX*1), DX 1806 SUBQ AX, DX 1807 MOVQ -8(BP)(AX*1), BX 1808 MOVQ (BP), BP 1809 loop9to15: 1810 MOVQ (DI), SI 1811 CMPQ SI,BP 1812 JZ partial_success9to15 1813 ADDQ $1,DI 1814 CMPQ DI,DX 1815 JB loop9to15 1816 JMP fail 1817 partial_success9to15: 1818 MOVQ -8(AX)(DI*1), SI 1819 CMPQ SI,BX 1820 JZ success 1821 ADDQ $1,DI 1822 CMPQ DI,DX 1823 JB loop9to15 1824 JMP fail 1825 _16_or_more: 1826 CMPQ AX, $16 1827 JA _17_or_more 1828 MOVOU (BP), X1 1829 LEAQ -15(DI)(DX*1), DX 1830 loop16: 1831 MOVOU (DI), X2 1832 PCMPEQB X1, X2 1833 PMOVMSKB X2, SI 1834 CMPQ SI, $0xffff 1835 JE success 1836 ADDQ $1,DI 1837 CMPQ DI,DX 1838 JB loop16 1839 JMP fail 1840 _17_or_more: 1841 CMPQ AX, $31 1842 JA _32_or_more 1843 LEAQ 1(DI)(DX*1), DX 1844 SUBQ AX, DX 1845 MOVOU -16(BP)(AX*1), X0 1846 MOVOU (BP), X1 1847 loop17to31: 1848 MOVOU (DI), X2 1849 PCMPEQB X1,X2 1850 PMOVMSKB X2, SI 1851 CMPQ SI, $0xffff 1852 JE partial_success17to31 1853 ADDQ $1,DI 1854 CMPQ DI,DX 1855 JB loop17to31 1856 JMP fail 1857 partial_success17to31: 1858 MOVOU -16(AX)(DI*1), X3 1859 PCMPEQB X0, X3 1860 PMOVMSKB X3, SI 1861 CMPQ SI, $0xffff 1862 JE success 1863 ADDQ $1,DI 1864 CMPQ DI,DX 1865 JB loop17to31 1866 JMP fail 1867 // We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63 1868 // So no need to check cpuid 1869 _32_or_more: 1870 CMPQ AX, $32 1871 JA _33_to_63 1872 VMOVDQU (BP), Y1 1873 LEAQ -31(DI)(DX*1), DX 1874 loop32: 1875 VMOVDQU (DI), Y2 1876 
VPCMPEQB Y1, Y2, Y3 1877 VPMOVMSKB Y3, SI 1878 CMPL SI, $0xffffffff 1879 JE success_avx2 1880 ADDQ $1,DI 1881 CMPQ DI,DX 1882 JB loop32 1883 JMP fail_avx2 1884 _33_to_63: 1885 LEAQ 1(DI)(DX*1), DX 1886 SUBQ AX, DX 1887 VMOVDQU -32(BP)(AX*1), Y0 1888 VMOVDQU (BP), Y1 1889 loop33to63: 1890 VMOVDQU (DI), Y2 1891 VPCMPEQB Y1, Y2, Y3 1892 VPMOVMSKB Y3, SI 1893 CMPL SI, $0xffffffff 1894 JE partial_success33to63 1895 ADDQ $1,DI 1896 CMPQ DI,DX 1897 JB loop33to63 1898 JMP fail_avx2 1899 partial_success33to63: 1900 VMOVDQU -32(AX)(DI*1), Y3 1901 VPCMPEQB Y0, Y3, Y4 1902 VPMOVMSKB Y4, SI 1903 CMPL SI, $0xffffffff 1904 JE success_avx2 1905 ADDQ $1,DI 1906 CMPQ DI,DX 1907 JB loop33to63 1908 fail_avx2: 1909 VZEROUPPER 1910 fail: 1911 MOVQ $-1, (R11) 1912 RET 1913 success_avx2: 1914 VZEROUPPER 1915 JMP success 1916 sse42: 1917 CMPB runtime·support_sse42(SB), $1 1918 JNE no_sse42 1919 CMPQ AX, $12 1920 // PCMPESTRI is slower than normal compare, 1921 // so using it makes sense only if we advance 4+ bytes per compare 1922 // This value was determined experimentally and is the ~same 1923 // on Nehalem (first with SSE42) and Haswell. 
1924 JAE _9_or_more 1925 LEAQ 16(BP), SI 1926 TESTW $0xff0, SI 1927 JEQ no_sse42 1928 MOVOU (BP), X1 1929 LEAQ -15(DI)(DX*1), SI 1930 MOVQ $16, R9 1931 SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9 1932 loop_sse42: 1933 // 0x0c means: unsigned byte compare (bits 0,1 are 00) 1934 // for equality (bits 2,3 are 11) 1935 // result is not masked or inverted (bits 4,5 are 00) 1936 // and corresponds to first matching byte (bit 6 is 0) 1937 PCMPESTRI $0x0c, (DI), X1 1938 // CX == 16 means no match, 1939 // CX > R9 means partial match at the end of the string, 1940 // otherwise sep is at offset CX from X1 start 1941 CMPQ CX, R9 1942 JBE sse42_success 1943 ADDQ R9, DI 1944 CMPQ DI, SI 1945 JB loop_sse42 1946 PCMPESTRI $0x0c, -1(SI), X1 1947 CMPQ CX, R9 1948 JA fail 1949 LEAQ -1(SI), DI 1950 sse42_success: 1951 ADDQ CX, DI 1952 success: 1953 SUBQ R10, DI 1954 MOVQ DI, (R11) 1955 RET 1956 1957 1958 TEXT bytes·IndexByte(SB),NOSPLIT,$0-40 1959 MOVQ s+0(FP), SI 1960 MOVQ s_len+8(FP), BX 1961 MOVB c+24(FP), AL 1962 LEAQ ret+32(FP), R8 1963 JMP runtime·indexbytebody(SB) 1964 1965 TEXT strings·IndexByte(SB),NOSPLIT,$0-32 1966 MOVQ s+0(FP), SI 1967 MOVQ s_len+8(FP), BX 1968 MOVB c+16(FP), AL 1969 LEAQ ret+24(FP), R8 1970 JMP runtime·indexbytebody(SB) 1971 1972 // input: 1973 // SI: data 1974 // BX: data len 1975 // AL: byte sought 1976 // R8: address to put result 1977 TEXT runtime·indexbytebody(SB),NOSPLIT,$0 1978 // Shuffle X0 around so that each byte contains 1979 // the character we're looking for. 1980 MOVD AX, X0 1981 PUNPCKLBW X0, X0 1982 PUNPCKLBW X0, X0 1983 PSHUFL $0, X0, X0 1984 1985 CMPQ BX, $16 1986 JLT small 1987 1988 MOVQ SI, DI 1989 1990 CMPQ BX, $32 1991 JA avx2 1992 sse: 1993 LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes 1994 JMP sseloopentry 1995 1996 sseloop: 1997 // Move the next 16-byte chunk of the data into X1. 1998 MOVOU (DI), X1 1999 // Compare bytes in X0 to X1. 
2000 PCMPEQB X0, X1 2001 // Take the top bit of each byte in X1 and put the result in DX. 2002 PMOVMSKB X1, DX 2003 // Find first set bit, if any. 2004 BSFL DX, DX 2005 JNZ ssesuccess 2006 // Advance to next block. 2007 ADDQ $16, DI 2008 sseloopentry: 2009 CMPQ DI, AX 2010 JB sseloop 2011 2012 // Search the last 16-byte chunk. This chunk may overlap with the 2013 // chunks we've already searched, but that's ok. 2014 MOVQ AX, DI 2015 MOVOU (AX), X1 2016 PCMPEQB X0, X1 2017 PMOVMSKB X1, DX 2018 BSFL DX, DX 2019 JNZ ssesuccess 2020 2021 failure: 2022 MOVQ $-1, (R8) 2023 RET 2024 2025 // We've found a chunk containing the byte. 2026 // The chunk was loaded from DI. 2027 // The index of the matching byte in the chunk is DX. 2028 // The start of the data is SI. 2029 ssesuccess: 2030 SUBQ SI, DI // Compute offset of chunk within data. 2031 ADDQ DX, DI // Add offset of byte within chunk. 2032 MOVQ DI, (R8) 2033 RET 2034 2035 // handle for lengths < 16 2036 small: 2037 TESTQ BX, BX 2038 JEQ failure 2039 2040 // Check if we'll load across a page boundary. 2041 LEAQ 16(SI), AX 2042 TESTW $0xff0, AX 2043 JEQ endofpage 2044 2045 MOVOU (SI), X1 // Load data 2046 PCMPEQB X0, X1 // Compare target byte with each byte in data. 2047 PMOVMSKB X1, DX // Move result bits to integer register. 2048 BSFL DX, DX // Find first set bit. 2049 JZ failure // No set bit, failure. 2050 CMPL DX, BX 2051 JAE failure // Match is past end of data. 2052 MOVQ DX, (R8) 2053 RET 2054 2055 endofpage: 2056 MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1. 2057 PCMPEQB X0, X1 // Compare target byte with each byte in data. 2058 PMOVMSKB X1, DX // Move result bits to integer register. 2059 MOVL BX, CX 2060 SHLL CX, DX 2061 SHRL $16, DX // Shift desired bits down to bottom of register. 2062 BSFL DX, DX // Find first set bit. 2063 JZ failure // No set bit, failure. 
2064 MOVQ DX, (R8) 2065 RET 2066 2067 avx2: 2068 CMPB runtime·support_avx2(SB), $1 2069 JNE sse 2070 MOVD AX, X0 2071 LEAQ -32(SI)(BX*1), R11 2072 VPBROADCASTB X0, Y1 2073 avx2_loop: 2074 VMOVDQU (DI), Y2 2075 VPCMPEQB Y1, Y2, Y3 2076 VPTEST Y3, Y3 2077 JNZ avx2success 2078 ADDQ $32, DI 2079 CMPQ DI, R11 2080 JLT avx2_loop 2081 MOVQ R11, DI 2082 VMOVDQU (DI), Y2 2083 VPCMPEQB Y1, Y2, Y3 2084 VPTEST Y3, Y3 2085 JNZ avx2success 2086 VZEROUPPER 2087 MOVQ $-1, (R8) 2088 RET 2089 2090 avx2success: 2091 VPMOVMSKB Y3, DX 2092 BSFL DX, DX 2093 SUBQ SI, DI 2094 ADDQ DI, DX 2095 MOVQ DX, (R8) 2096 VZEROUPPER 2097 RET 2098 2099 TEXT bytes·Equal(SB),NOSPLIT,$0-49 2100 MOVQ a_len+8(FP), BX 2101 MOVQ b_len+32(FP), CX 2102 CMPQ BX, CX 2103 JNE eqret 2104 MOVQ a+0(FP), SI 2105 MOVQ b+24(FP), DI 2106 LEAQ ret+48(FP), AX 2107 JMP runtime·memeqbody(SB) 2108 eqret: 2109 MOVB $0, ret+48(FP) 2110 RET 2111 2112 2113 TEXT bytes·countByte(SB),NOSPLIT,$0-40 2114 MOVQ s+0(FP), SI 2115 MOVQ s_len+8(FP), BX 2116 MOVB c+24(FP), AL 2117 LEAQ ret+32(FP), R8 2118 JMP runtime·countByte(SB) 2119 2120 TEXT strings·countByte(SB),NOSPLIT,$0-32 2121 MOVQ s+0(FP), SI 2122 MOVQ s_len+8(FP), BX 2123 MOVB c+16(FP), AL 2124 LEAQ ret+24(FP), R8 2125 JMP runtime·countByte(SB) 2126 2127 // input: 2128 // SI: data 2129 // BX: data len 2130 // AL: byte sought 2131 // R8: address to put result 2132 // This requires the POPCNT instruction 2133 TEXT runtime·countByte(SB),NOSPLIT,$0 2134 // Shuffle X0 around so that each byte contains 2135 // the character we're looking for. 2136 MOVD AX, X0 2137 PUNPCKLBW X0, X0 2138 PUNPCKLBW X0, X0 2139 PSHUFL $0, X0, X0 2140 2141 CMPQ BX, $16 2142 JLT small 2143 2144 MOVQ $0, R12 // Accumulator 2145 2146 MOVQ SI, DI 2147 2148 CMPQ BX, $32 2149 JA avx2 2150 sse: 2151 LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes 2152 JMP sseloopentry 2153 2154 sseloop: 2155 // Move the next 16-byte chunk of the data into X1. 2156 MOVOU (DI), X1 2157 // Compare bytes in X0 to X1. 
2158 PCMPEQB X0, X1 2159 // Take the top bit of each byte in X1 and put the result in DX. 2160 PMOVMSKB X1, DX 2161 // Count number of matching bytes 2162 POPCNTL DX, DX 2163 // Accumulate into R12 2164 ADDQ DX, R12 2165 // Advance to next block. 2166 ADDQ $16, DI 2167 sseloopentry: 2168 CMPQ DI, AX 2169 JBE sseloop 2170 2171 // Get the number of bytes to consider in the last 16 bytes 2172 ANDQ $15, BX 2173 JZ end 2174 2175 // Create mask to ignore overlap between previous 16 byte block 2176 // and the next. 2177 MOVQ $16,CX 2178 SUBQ BX, CX 2179 MOVQ $0xFFFF, R10 2180 SARQ CL, R10 2181 SALQ CL, R10 2182 2183 // Process the last 16-byte chunk. This chunk may overlap with the 2184 // chunks we've already searched so we need to mask part of it. 2185 MOVOU (AX), X1 2186 PCMPEQB X0, X1 2187 PMOVMSKB X1, DX 2188 // Apply mask 2189 ANDQ R10, DX 2190 POPCNTL DX, DX 2191 ADDQ DX, R12 2192 end: 2193 MOVQ R12, (R8) 2194 RET 2195 2196 // handle for lengths < 16 2197 small: 2198 TESTQ BX, BX 2199 JEQ endzero 2200 2201 // Check if we'll load across a page boundary. 2202 LEAQ 16(SI), AX 2203 TESTW $0xff0, AX 2204 JEQ endofpage 2205 2206 // We must ignore high bytes as they aren't part of our slice. 2207 // Create mask. 2208 MOVB BX, CX 2209 MOVQ $1, R10 2210 SALQ CL, R10 2211 SUBQ $1, R10 2212 2213 // Load data 2214 MOVOU (SI), X1 2215 // Compare target byte with each byte in data. 2216 PCMPEQB X0, X1 2217 // Move result bits to integer register. 2218 PMOVMSKB X1, DX 2219 // Apply mask 2220 ANDQ R10, DX 2221 POPCNTL DX, DX 2222 // Directly return DX, we don't need to accumulate 2223 // since we have <16 bytes. 2224 MOVQ DX, (R8) 2225 RET 2226 endzero: 2227 MOVQ $0, (R8) 2228 RET 2229 2230 endofpage: 2231 // We must ignore low bytes as they aren't part of our slice. 2232 MOVQ $16,CX 2233 SUBQ BX, CX 2234 MOVQ $0xFFFF, R10 2235 SARQ CL, R10 2236 SALQ CL, R10 2237 2238 // Load data into the high end of X1. 
2239 MOVOU -16(SI)(BX*1), X1 2240 // Compare target byte with each byte in data. 2241 PCMPEQB X0, X1 2242 // Move result bits to integer register. 2243 PMOVMSKB X1, DX 2244 // Apply mask 2245 ANDQ R10, DX 2246 // Directly return DX, we don't need to accumulate 2247 // since we have <16 bytes. 2248 POPCNTL DX, DX 2249 MOVQ DX, (R8) 2250 RET 2251 2252 avx2: 2253 CMPB runtime·support_avx2(SB), $1 2254 JNE sse 2255 MOVD AX, X0 2256 LEAQ -32(SI)(BX*1), R11 2257 VPBROADCASTB X0, Y1 2258 avx2_loop: 2259 VMOVDQU (DI), Y2 2260 VPCMPEQB Y1, Y2, Y3 2261 VPMOVMSKB Y3, DX 2262 POPCNTL DX, DX 2263 ADDQ DX, R12 2264 ADDQ $32, DI 2265 CMPQ DI, R11 2266 JLE avx2_loop 2267 2268 // If last block is already processed, 2269 // skip to the end. 2270 CMPQ DI, R11 2271 JEQ endavx 2272 2273 // Load address of the last 32 bytes. 2274 // There is an overlap with the previous block. 2275 MOVQ R11, DI 2276 VMOVDQU (DI), Y2 2277 VPCMPEQB Y1, Y2, Y3 2278 VPMOVMSKB Y3, DX 2279 // Exit AVX mode. 2280 VZEROUPPER 2281 2282 // Create mask to ignore overlap between previous 32 byte block 2283 // and the next. 2284 ANDQ $31, BX 2285 MOVQ $32,CX 2286 SUBQ BX, CX 2287 MOVQ $0xFFFFFFFF, R10 2288 SARQ CL, R10 2289 SALQ CL, R10 2290 // Apply mask 2291 ANDQ R10, DX 2292 POPCNTL DX, DX 2293 ADDQ DX, R12 2294 MOVQ R12, (R8) 2295 RET 2296 endavx: 2297 // Exit AVX mode. 2298 VZEROUPPER 2299 MOVQ R12, (R8) 2300 RET 2301 2302 TEXT runtime·return0(SB), NOSPLIT, $0 2303 MOVL $0, AX 2304 RET 2305 2306 2307 // Called from cgo wrappers, this function returns g->m->curg.stack.hi. 2308 // Must obey the gcc calling convention. 2309 TEXT _cgo_topofstack(SB),NOSPLIT,$0 2310 get_tls(CX) 2311 MOVQ g(CX), AX 2312 MOVQ g_m(AX), AX 2313 MOVQ m_curg(AX), AX 2314 MOVQ (g_stack+stack_hi)(AX), AX 2315 RET 2316 2317 // The top-most function running on a goroutine 2318 // returns to goexit+PCQuantum. 
2319 TEXT runtime·goexit(SB),NOSPLIT,$0-0 2320 BYTE $0x90 // NOP 2321 CALL runtime·goexit1(SB) // does not return 2322 // traceback from goexit1 must hit code range of goexit 2323 BYTE $0x90 // NOP 2324 2325 // This is called from .init_array and follows the platform, not Go, ABI. 2326 TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0 2327 PUSHQ R15 // The access to global variables below implicitly uses R15, which is callee-save 2328 MOVQ runtime·lastmoduledatap(SB), AX 2329 MOVQ DI, moduledata_next(AX) 2330 MOVQ DI, runtime·lastmoduledatap(SB) 2331 POPQ R15 2332 RET