// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "go_tls.h"
#include "funcdata.h"
#include "textflag.h"

// rt0_go is the program entry point: it sets up g0/m0, queries CPU
// features via CPUID, performs optional cgo and TLS initialization,
// runs runtime initialization, queues runtime.main as the first
// goroutine, and starts this M. It never returns.
TEXT runtime·rt0_go(SB),NOSPLIT,$0
	// copy arguments forward on an even stack
	MOVQ	DI, AX		// argc
	MOVQ	SI, BX		// argv
	SUBQ	$(4*8+7), SP		// 2args 2auto
	ANDQ	$~15, SP
	MOVQ	AX, 16(SP)
	MOVQ	BX, 24(SP)

	// create istack out of the given (operating system) stack.
	// _cgo_init may update stackguard.
	MOVQ	$runtime·g0(SB), DI
	LEAQ	(-64*1024+104)(SP), BX
	MOVQ	BX, g_stackguard0(DI)
	MOVQ	BX, g_stackguard1(DI)
	MOVQ	BX, (g_stack+stack_lo)(DI)
	MOVQ	SP, (g_stack+stack_hi)(DI)

	// find out information about the processor we're on
	MOVL	$0, AX
	CPUID
	MOVL	AX, SI		// SI = max basic CPUID leaf
	CMPL	AX, $0
	JE	nocpuinfo

	// Figure out how to serialize RDTSC.
	// On Intel processors LFENCE is enough. AMD requires MFENCE.
	// Don't know about the rest, so let's do MFENCE.
	// CPUID leaf 0 returns the vendor string in BX,DX,CX,
	// which spells "GenuineIntel" when compared in that order.
	CMPL	BX, $0x756E6547  // "Genu"
	JNE	notintel
	CMPL	DX, $0x49656E69  // "ineI"
	JNE	notintel
	CMPL	CX, $0x6C65746E  // "ntel"
	JNE	notintel
	MOVB	$1, runtime·isIntel(SB)
	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
notintel:

	// Load EAX=1 cpuid flags
	MOVL	$1, AX
	CPUID
	MOVL	AX, runtime·processorVersionInfo(SB)

	// SETNE stores a single feature byte per flag below.
	TESTL	$(1<<26), DX // SSE2
	SETNE	runtime·support_sse2(SB)

	TESTL	$(1<<9), CX // SSSE3
	SETNE	runtime·support_ssse3(SB)

	TESTL	$(1<<19), CX // SSE4.1
	SETNE	runtime·support_sse41(SB)

	TESTL	$(1<<20), CX // SSE4.2
	SETNE	runtime·support_sse42(SB)

	TESTL	$(1<<23), CX // POPCNT
	SETNE	runtime·support_popcnt(SB)

	TESTL	$(1<<25), CX // AES
	SETNE	runtime·support_aes(SB)

	TESTL	$(1<<27), CX // OSXSAVE
	SETNE	runtime·support_osxsave(SB)

	// If OS support for XMM and YMM is not present
	// support_avx will be set back to false later.
	TESTL	$(1<<28), CX // AVX
	SETNE	runtime·support_avx(SB)

eax7:
	// Load EAX=7/ECX=0 cpuid flags
	CMPL	SI, $7
	JLT	osavx
	MOVL	$7, AX
	MOVL	$0, CX
	CPUID

	TESTL	$(1<<3), BX // BMI1
	SETNE	runtime·support_bmi1(SB)

	// If OS support for XMM and YMM is not present
	// support_avx2 will be set back to false later.
	TESTL	$(1<<5), BX // AVX2
	SETNE	runtime·support_avx2(SB)

	TESTL	$(1<<8), BX // BMI2
	SETNE	runtime·support_bmi2(SB)

	TESTL	$(1<<9), BX // ERMS
	SETNE	runtime·support_erms(SB)

osavx:
	CMPB	runtime·support_osxsave(SB), $1
	JNE	noavx
	MOVL	$0, CX
	// For XGETBV, OSXSAVE bit is required and sufficient
	XGETBV
	ANDL	$6, AX
	CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
	JE	nocpuinfo
noavx:
	MOVB	$0, runtime·support_avx(SB)
	MOVB	$0, runtime·support_avx2(SB)

nocpuinfo:
	// if there is an _cgo_init, call it.
	MOVQ	_cgo_init(SB), AX
	TESTQ	AX, AX
	JZ	needtls
	// g0 already in DI
	MOVQ	DI, CX	// Win64 uses CX for first parameter
	MOVQ	$setg_gcc<>(SB), SI
	CALL	AX

	// update stackguard after _cgo_init
	MOVQ	$runtime·g0(SB), CX
	MOVQ	(g_stack+stack_lo)(CX), AX
	ADDQ	$const__StackGuard, AX
	MOVQ	AX, g_stackguard0(CX)
	MOVQ	AX, g_stackguard1(CX)

#ifndef GOOS_windows
	JMP ok
#endif
needtls:
#ifdef GOOS_plan9
	// skip TLS setup on Plan 9
	JMP ok
#endif
#ifdef GOOS_solaris
	// skip TLS setup on Solaris
	JMP ok
#endif

	LEAQ	runtime·m0+m_tls(SB), DI
	CALL	runtime·settls(SB)

	// store through it, to make sure it works
	get_tls(BX)
	MOVQ	$0x123, g(BX)
	MOVQ	runtime·m0+m_tls(SB), AX
	CMPQ	AX, $0x123
	JEQ 2(PC)
	MOVL	AX, 0	// abort
ok:
	// set the per-goroutine and per-mach "registers"
	get_tls(BX)
	LEAQ	runtime·g0(SB), CX
	MOVQ	CX, g(BX)
	LEAQ	runtime·m0(SB), AX

	// save m->g0 = g0
	MOVQ	CX, m_g0(AX)
	// save m0 to g0->m
	MOVQ	AX, g_m(CX)

	CLD				// convention is D is always left cleared
	CALL	runtime·check(SB)

	MOVL	16(SP), AX		// copy argc
	MOVL	AX, 0(SP)
	MOVQ	24(SP), AX		// copy argv
	MOVQ	AX, 8(SP)
	CALL	runtime·args(SB)
	CALL	runtime·osinit(SB)
	CALL	runtime·schedinit(SB)

	// create a new goroutine to start program
	MOVQ	$runtime·mainPC(SB), AX		// entry
	PUSHQ	AX
	PUSHQ	$0			// arg size
	CALL	runtime·newproc(SB)
	POPQ	AX
	POPQ	AX

	// start this M
	CALL	runtime·mstart(SB)

	MOVL	$0xf1, 0xf1  // crash: mstart should never return
	RET

// mainPC is a pointer to runtime.main, used as the entry PC of the
// first goroutine created above.
DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
GLOBL	runtime·mainPC(SB),RODATA,$8

TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
	BYTE	$0xcc	// INT 3 breakpoint
	RET

TEXT runtime·asminit(SB),NOSPLIT,$0-0
	// No per-thread init.
	RET

/*
 *  go-routine
 */

// void gosave(Gobuf*)
// save state in Gobuf; setjmp
TEXT runtime·gosave(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), AX		// gobuf
	LEAQ	buf+0(FP), BX		// caller's SP
	MOVQ	BX, gobuf_sp(AX)
	MOVQ	0(SP), BX		// caller's PC
	MOVQ	BX, gobuf_pc(AX)
	MOVQ	$0, gobuf_ret(AX)
	MOVQ	BP, gobuf_bp(AX)
	// Assert ctxt is zero. See func save.
	MOVQ	gobuf_ctxt(AX), BX
	TESTQ	BX, BX
	JZ	2(PC)
	CALL	runtime·badctxt(SB)
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	BX, gobuf_g(AX)
	RET

// void gogo(Gobuf*)
// restore state from Gobuf; longjmp
TEXT runtime·gogo(SB), NOSPLIT, $16-8
	MOVQ	buf+0(FP), BX		// gobuf

	// If ctxt is not nil, invoke deletion barrier before overwriting.
	MOVQ	gobuf_ctxt(BX), AX
	TESTQ	AX, AX
	JZ	nilctxt
	LEAQ	gobuf_ctxt(BX), AX
	MOVQ	AX, 0(SP)
	MOVQ	$0, 8(SP)
	CALL	runtime·writebarrierptr_prewrite(SB)
	// CALL may have clobbered BX; reload the gobuf pointer.
	MOVQ	buf+0(FP), BX

nilctxt:
	MOVQ	gobuf_g(BX), DX
	MOVQ	0(DX), CX		// make sure g != nil
	get_tls(CX)
	MOVQ	DX, g(CX)
	MOVQ	gobuf_sp(BX), SP	// restore SP
	MOVQ	gobuf_ret(BX), AX
	MOVQ	gobuf_ctxt(BX), DX
	MOVQ	gobuf_bp(BX), BP
	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
	MOVQ	$0, gobuf_ret(BX)
	MOVQ	$0, gobuf_ctxt(BX)
	MOVQ	$0, gobuf_bp(BX)
	MOVQ	gobuf_pc(BX), BX
	JMP	BX

// func mcall(fn func(*g))
// Switch to m->g0's stack, call fn(g).
// Fn must never return. It should gogo(&g->sched)
// to keep running g.
TEXT runtime·mcall(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI

	get_tls(CX)
	MOVQ	g(CX), AX	// save state in g->sched
	MOVQ	0(SP), BX	// caller's PC
	MOVQ	BX, (g_sched+gobuf_pc)(AX)
	LEAQ	fn+0(FP), BX	// caller's SP
	MOVQ	BX, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to m->g0 & its stack, call fn
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	SI, AX	// if g == m->g0 call badmcall
	JNE	3(PC)
	MOVQ	$runtime·badmcall(SB), AX
	JMP	AX	// badmcall does not return
	MOVQ	SI, g(CX)	// g = m->g0
	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	PUSHQ	AX	// push the old g as fn's argument
	MOVQ	DI, DX
	MOVQ	0(DI), DI	// load code pointer from the funcval
	CALL	DI
	POPQ	AX
	// fn must not return; reaching here is a fatal error.
	MOVQ	$runtime·badmcall2(SB), AX
	JMP	AX
	RET

// systemstack_switch is a dummy routine that systemstack leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the system stack because the one at the top of
// the system stack terminates the stack walk (see topofstack()).
TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
	RET

// func systemstack(fn func())
TEXT runtime·systemstack(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI	// DI = fn
	get_tls(CX)
	MOVQ	g(CX), AX	// AX = g
	MOVQ	g_m(AX), BX	// BX = m

	MOVQ	m_gsignal(BX), DX	// DX = gsignal
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_g0(BX), DX	// DX = g0
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_curg(BX), R8
	CMPQ	AX, R8
	JEQ	switch

	// Bad: g is not gsignal, not g0, not curg. What is it?
	MOVQ	$runtime·badsystemstack(SB), AX
	CALL	AX

switch:
	// save our state in g->sched. Pretend to
	// be systemstack_switch if the G stack is scanned.
	MOVQ	$runtime·systemstack_switch(SB), SI
	MOVQ	SI, (g_sched+gobuf_pc)(AX)
	MOVQ	SP, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to g0
	MOVQ	DX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(DX), BX
	// make it look like mstart called systemstack on g0, to stop traceback
	SUBQ	$8, BX
	MOVQ	$runtime·mstart(SB), DX
	MOVQ	DX, 0(BX)
	MOVQ	BX, SP

	// call target function
	MOVQ	DI, DX
	MOVQ	0(DI), DI	// load code pointer from the funcval
	CALL	DI

	// switch back to g
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), BX
	MOVQ	m_curg(BX), AX
	MOVQ	AX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(AX), SP
	MOVQ	$0, (g_sched+gobuf_sp)(AX)
	RET

noswitch:
	// already on m stack, just call directly
	MOVQ	DI, DX
	MOVQ	0(DI), DI	// load code pointer from the funcval
	CALL	DI
	RET

/*
 * support for morestack
 */

// Called during function prolog when more stack is needed.
//
// The traceback routines see morestack on a g0 as being
// the top of a stack (for example, morestack calling newstack
// calling the scheduler calling newm calling gc), so we must
// record an argument size. For that purpose, it has no arguments.
TEXT runtime·morestack(SB),NOSPLIT,$0-0
	// Cannot grow scheduler stack (m->g0).
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	g(CX), SI
	JNE	3(PC)
	CALL	runtime·badmorestackg0(SB)
	INT	$3

	// Cannot grow signal stack (m->gsignal).
	MOVQ	m_gsignal(BX), SI
	CMPQ	g(CX), SI
	JNE	3(PC)
	CALL	runtime·badmorestackgsignal(SB)
	INT	$3

	// Called from f.
	// Set m->morebuf to f's caller.
	MOVQ	8(SP), AX	// f's caller's PC
	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
	LEAQ	16(SP), AX	// f's caller's SP
	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SI, (m_morebuf+gobuf_g)(BX)

	// Set g->sched to context in f.
	MOVQ	0(SP), AX // f's PC
	MOVQ	AX, (g_sched+gobuf_pc)(SI)
	MOVQ	SI, (g_sched+gobuf_g)(SI)
	LEAQ	8(SP), AX // f's SP
	MOVQ	AX, (g_sched+gobuf_sp)(SI)
	MOVQ	BP, (g_sched+gobuf_bp)(SI)
	// newstack will fill gobuf.ctxt.

	// Call newstack on m->g0's stack.
	MOVQ	m_g0(BX), BX
	MOVQ	BX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(BX), SP
	PUSHQ	DX	// ctxt argument
	CALL	runtime·newstack(SB)
	MOVQ	$0, 0x1003	// crash if newstack returns
	POPQ	DX	// keep balance check happy
	RET

// morestack but not preserving ctxt.
TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
	MOVL	$0, DX
	JMP	runtime·morestack(SB)

// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
// of constant-sized-frame functions to encode a few bits of size in the pc.
// Caution: ugly multiline assembly macros in your future!

// DISPATCH jumps to NAME if the argument size in CX is <= MAXSIZE;
// otherwise it falls through to the next DISPATCH check.
#define DISPATCH(NAME,MAXSIZE)		\
	CMPQ	CX, $MAXSIZE;		\
	JA	3(PC);			\
	MOVQ	$NAME(SB), AX;		\
	JMP	AX
// Note: can't just "JMP NAME(SB)" - bad inlining results.
TEXT reflect·call(SB), NOSPLIT, $0-0
	JMP	·reflectcall(SB)

// reflectcall dispatches to the call* routine whose fixed frame size
// is the smallest that fits argsize.
TEXT ·reflectcall(SB), NOSPLIT, $0-32
	MOVLQZX argsize+24(FP), CX
	DISPATCH(runtime·call32, 32)
	DISPATCH(runtime·call64, 64)
	DISPATCH(runtime·call128, 128)
	DISPATCH(runtime·call256, 256)
	DISPATCH(runtime·call512, 512)
	DISPATCH(runtime·call1024, 1024)
	DISPATCH(runtime·call2048, 2048)
	DISPATCH(runtime·call4096, 4096)
	DISPATCH(runtime·call8192, 8192)
	DISPATCH(runtime·call16384, 16384)
	DISPATCH(runtime·call32768, 32768)
	DISPATCH(runtime·call65536, 65536)
	DISPATCH(runtime·call131072, 131072)
	DISPATCH(runtime·call262144, 262144)
	DISPATCH(runtime·call524288, 524288)
	DISPATCH(runtime·call1048576, 1048576)
	DISPATCH(runtime·call2097152, 2097152)
	DISPATCH(runtime·call4194304, 4194304)
	DISPATCH(runtime·call8388608, 8388608)
	DISPATCH(runtime·call16777216, 16777216)
	DISPATCH(runtime·call33554432, 33554432)
	DISPATCH(runtime·call67108864, 67108864)
	DISPATCH(runtime·call134217728, 134217728)
	DISPATCH(runtime·call268435456, 268435456)
	DISPATCH(runtime·call536870912, 536870912)
	DISPATCH(runtime·call1073741824, 1073741824)
	MOVQ	$runtime·badreflectcall(SB), AX
	JMP	AX

// CALLFN defines one fixed-frame-size call* routine: it copies the
// arguments onto its own frame, calls f, then copies the results
// (from retoffset onward) back to the caller's argument block via
// callRet so the write is visible to the GC.
#define CALLFN(NAME,MAXSIZE)			\
TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
	NO_LOCAL_POINTERS;			\
	/* copy arguments to stack */		\
	MOVQ	argptr+16(FP), SI;		\
	MOVLQZX argsize+24(FP), CX;		\
	MOVQ	SP, DI;				\
	REP;MOVSB;				\
	/* call function */			\
	MOVQ	f+8(FP), DX;			\
	PCDATA  $PCDATA_StackMapIndex, $0;	\
	CALL	(DX);				\
	/* copy return values back */		\
	MOVQ	argtype+0(FP), DX;		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX	argsize+24(FP), CX;		\
	MOVLQZX	retoffset+28(FP), BX;		\
	MOVQ	SP, SI;				\
	ADDQ	BX, DI;				\
	ADDQ	BX, SI;				\
	SUBQ	BX, CX;				\
	CALL	callRet<>(SB);			\
	RET

// callRet copies return values back at the end of call*. This is a
// separate function so it can allocate stack space for the arguments
// to reflectcallmove. It does not follow the Go ABI; it expects its
// arguments in registers (DX=argtype, DI=dst, SI=src, CX=size).
TEXT callRet<>(SB), NOSPLIT, $32-0
	NO_LOCAL_POINTERS
	MOVQ	DX, 0(SP)
	MOVQ	DI, 8(SP)
	MOVQ	SI, 16(SP)
	MOVQ	CX, 24(SP)
	CALL	runtime·reflectcallmove(SB)
	RET

CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
CALLFN(·call256, 256)
CALLFN(·call512, 512)
CALLFN(·call1024, 1024)
CALLFN(·call2048, 2048)
CALLFN(·call4096, 4096)
CALLFN(·call8192, 8192)
CALLFN(·call16384, 16384)
CALLFN(·call32768, 32768)
CALLFN(·call65536, 65536)
CALLFN(·call131072, 131072)
CALLFN(·call262144, 262144)
CALLFN(·call524288, 524288)
CALLFN(·call1048576, 1048576)
CALLFN(·call2097152, 2097152)
CALLFN(·call4194304, 4194304)
CALLFN(·call8388608, 8388608)
CALLFN(·call16777216, 16777216)
CALLFN(·call33554432, 33554432)
CALLFN(·call67108864, 67108864)
CALLFN(·call134217728, 134217728)
CALLFN(·call268435456, 268435456)
CALLFN(·call536870912, 536870912)
CALLFN(·call1073741824, 1073741824)

// procyield(cycles uint32): spin for the given number of PAUSE
// iterations to back off in spin loops.
TEXT runtime·procyield(SB),NOSPLIT,$0-0
	MOVL	cycles+0(FP), AX
again:
	PAUSE
	SUBL	$1, AX
	JNZ	again
	RET

TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
	// Stores are already ordered on x86, so this is just a
	// compile barrier.
	RET

// void jmpdefer(fn, sp);
// called from deferreturn.
// 1. pop the caller
// 2. sub 5 bytes from the callers return
// 3. jmp to the argument
jmp to the argument 555 TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16 556 MOVQ fv+0(FP), DX // fn 557 MOVQ argp+8(FP), BX // caller sp 558 LEAQ -8(BX), SP // caller sp after CALL 559 MOVQ -8(SP), BP // restore BP as if deferreturn returned (harmless if framepointers not in use) 560 SUBQ $5, (SP) // return to CALL again 561 MOVQ 0(DX), BX 562 JMP BX // but first run the deferred function 563 564 // Save state of caller into g->sched. Smashes R8, R9. 565 TEXT gosave<>(SB),NOSPLIT,$0 566 get_tls(R8) 567 MOVQ g(R8), R8 568 MOVQ 0(SP), R9 569 MOVQ R9, (g_sched+gobuf_pc)(R8) 570 LEAQ 8(SP), R9 571 MOVQ R9, (g_sched+gobuf_sp)(R8) 572 MOVQ $0, (g_sched+gobuf_ret)(R8) 573 MOVQ BP, (g_sched+gobuf_bp)(R8) 574 // Assert ctxt is zero. See func save. 575 MOVQ (g_sched+gobuf_ctxt)(R8), R9 576 TESTQ R9, R9 577 JZ 2(PC) 578 CALL runtime·badctxt(SB) 579 RET 580 581 // func asmcgocall(fn, arg unsafe.Pointer) int32 582 // Call fn(arg) on the scheduler stack, 583 // aligned appropriately for the gcc ABI. 584 // See cgocall.go for more details. 585 TEXT ·asmcgocall(SB),NOSPLIT,$0-20 586 MOVQ fn+0(FP), AX 587 MOVQ arg+8(FP), BX 588 589 MOVQ SP, DX 590 591 // Figure out if we need to switch to m->g0 stack. 592 // We get called to create new OS threads too, and those 593 // come in on the m->g0 stack already. 594 get_tls(CX) 595 MOVQ g(CX), R8 596 CMPQ R8, $0 597 JEQ nosave 598 MOVQ g_m(R8), R8 599 MOVQ m_g0(R8), SI 600 MOVQ g(CX), DI 601 CMPQ SI, DI 602 JEQ nosave 603 MOVQ m_gsignal(R8), SI 604 CMPQ SI, DI 605 JEQ nosave 606 607 // Switch to system stack. 608 MOVQ m_g0(R8), SI 609 CALL gosave<>(SB) 610 MOVQ SI, g(CX) 611 MOVQ (g_sched+gobuf_sp)(SI), SP 612 613 // Now on a scheduling stack (a pthread-created stack). 614 // Make sure we have enough room for 4 stack-backed fast-call 615 // registers as per windows amd64 calling convention. 
616 SUBQ $64, SP 617 ANDQ $~15, SP // alignment for gcc ABI 618 MOVQ DI, 48(SP) // save g 619 MOVQ (g_stack+stack_hi)(DI), DI 620 SUBQ DX, DI 621 MOVQ DI, 40(SP) // save depth in stack (can't just save SP, as stack might be copied during a callback) 622 MOVQ BX, DI // DI = first argument in AMD64 ABI 623 MOVQ BX, CX // CX = first argument in Win64 624 CALL AX 625 626 // Restore registers, g, stack pointer. 627 get_tls(CX) 628 MOVQ 48(SP), DI 629 MOVQ (g_stack+stack_hi)(DI), SI 630 SUBQ 40(SP), SI 631 MOVQ DI, g(CX) 632 MOVQ SI, SP 633 634 MOVL AX, ret+16(FP) 635 RET 636 637 nosave: 638 // Running on a system stack, perhaps even without a g. 639 // Having no g can happen during thread creation or thread teardown 640 // (see needm/dropm on Solaris, for example). 641 // This code is like the above sequence but without saving/restoring g 642 // and without worrying about the stack moving out from under us 643 // (because we're on a system stack, not a goroutine stack). 644 // The above code could be used directly if already on a system stack, 645 // but then the only path through this code would be a rare case on Solaris. 646 // Using this code for all "already on system stack" calls exercises it more, 647 // which should help keep it correct. 648 SUBQ $64, SP 649 ANDQ $~15, SP 650 MOVQ $0, 48(SP) // where above code stores g, in case someone looks during debugging 651 MOVQ DX, 40(SP) // save original stack pointer 652 MOVQ BX, DI // DI = first argument in AMD64 ABI 653 MOVQ BX, CX // CX = first argument in Win64 654 CALL AX 655 MOVQ 40(SP), SI // restore original stack pointer 656 MOVQ SI, SP 657 MOVL AX, ret+16(FP) 658 RET 659 660 // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt) 661 // Turn the fn into a Go func (by taking its address) and call 662 // cgocallback_gofunc. 
TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
	LEAQ	fn+0(FP), AX
	MOVQ	AX, 0(SP)
	MOVQ	frame+8(FP), AX
	MOVQ	AX, 8(SP)
	MOVQ	framesize+16(FP), AX
	MOVQ	AX, 16(SP)
	MOVQ	ctxt+24(FP), AX
	MOVQ	AX, 24(SP)
	MOVQ	$runtime·cgocallback_gofunc(SB), AX
	CALL	AX
	RET

// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
// See cgocall.go for more details.
TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
	NO_LOCAL_POINTERS

	// If g is nil, Go did not create the current thread.
	// Call needm to obtain one m for temporary use.
	// In this case, we're running on the thread stack, so there's
	// lots of space, but the linker doesn't know. Hide the call from
	// the linker analysis by using an indirect call through AX.
	get_tls(CX)
#ifdef GOOS_windows
	MOVL	$0, BX
	CMPQ	CX, $0
	JEQ	2(PC)
#endif
	MOVQ	g(CX), BX
	CMPQ	BX, $0
	JEQ	needm
	MOVQ	g_m(BX), BX
	MOVQ	BX, R8 // holds oldm until end of function
	JMP	havem
needm:
	MOVQ	$0, 0(SP)
	MOVQ	$runtime·needm(SB), AX
	CALL	AX
	MOVQ	0(SP), R8	// R8 = oldm (nil on this path)
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX

	// Set m->sched.sp = SP, so that if a panic happens
	// during the function we are about to execute, it will
	// have a valid SP to run on the g0 stack.
	// The next few lines (after the havem label)
	// will save this SP onto the stack and then write
	// the same SP back to m->sched.sp. That seems redundant,
	// but if an unrecovered panic happens, unwindm will
	// restore the g->sched.sp from the stack location
	// and then systemstack will try to use it. If we don't set it here,
	// that restored SP will be uninitialized (typically 0) and
	// will not be usable.
	MOVQ	m_g0(BX), SI
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

havem:
	// Now there's a valid m, and we're running on its m->g0.
	// Save current m->g0->sched.sp on stack and then set it to SP.
	// Save current sp in m->g0->sched.sp in preparation for
	// switch back to m->curg stack.
	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
	MOVQ	m_g0(BX), SI
	MOVQ	(g_sched+gobuf_sp)(SI), AX
	MOVQ	AX, 0(SP)
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

	// Switch to m->curg stack and call runtime.cgocallbackg.
	// Because we are taking over the execution of m->curg
	// but *not* resuming what had been running, we need to
	// save that information (m->curg->sched) so we can restore it.
	// We can restore m->curg->sched.sp easily, because calling
	// runtime.cgocallbackg leaves SP unchanged upon return.
	// To save m->curg->sched.pc, we push it onto the stack.
	// This has the added benefit that it looks to the traceback
	// routine like cgocallbackg is going to return to that
	// PC (because the frame we allocate below has the same
	// size as cgocallback_gofunc's frame declared above)
	// so that the traceback will seamlessly trace back into
	// the earlier calls.
	//
	// In the new goroutine, 8(SP) holds the saved R8.
	MOVQ	m_curg(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
	MOVQ	(g_sched+gobuf_pc)(SI), BX
	MOVQ	BX, -8(DI)
	// Compute the size of the frame, including return PC and, if
	// GOEXPERIMENT=framepointer, the saved base pointer
	MOVQ	ctxt+24(FP), BX
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX
	SUBQ	AX, DI
	MOVQ	DI, SP

	MOVQ	R8, 8(SP)
	MOVQ	BX, 0(SP)
	CALL	runtime·cgocallbackg(SB)
	MOVQ	8(SP), R8

	// Compute the size of the frame again. FP and SP have
	// completely different values here than they did above,
	// but only their difference matters.
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX

	// Restore g->sched (== m->curg->sched) from saved values.
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SP, DI
	ADDQ	AX, DI
	MOVQ	-8(DI), BX
	MOVQ	BX, (g_sched+gobuf_pc)(SI)
	MOVQ	DI, (g_sched+gobuf_sp)(SI)

	// Switch back to m->g0's stack and restore m->g0->sched.sp.
	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
	// so we do not have to restore it.)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP
	MOVQ	0(SP), AX
	MOVQ	AX, (g_sched+gobuf_sp)(SI)

	// If the m on entry was nil, we called needm above to borrow an m
	// for the duration of the call. Since the call is over, return it with dropm.
	CMPQ	R8, $0
	JNE 3(PC)
	MOVQ	$runtime·dropm(SB), AX
	CALL	AX

	// Done!
	RET

// void setg(G*); set g. for use by needm.
TEXT runtime·setg(SB), NOSPLIT, $0-8
	MOVQ	gg+0(FP), BX
#ifdef GOOS_windows
	CMPQ	BX, $0
	JNE	settls
	MOVQ	$0, 0x28(GS)
	RET
settls:
	MOVQ	g_m(BX), AX
	LEAQ	m_tls(AX), AX
	MOVQ	AX, 0x28(GS)
#endif
	get_tls(CX)
	MOVQ	BX, g(CX)
	RET

// void setg_gcc(G*); set g called from gcc.
TEXT setg_gcc<>(SB),NOSPLIT,$0
	get_tls(AX)
	MOVQ	DI, g(AX)	// DI = g, per the C (System V AMD64) calling convention
	RET

// check that SP is in range [g->stack.lo, g->stack.hi)
TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
	get_tls(CX)
	MOVQ	g(CX), AX
	CMPQ	(g_stack+stack_hi)(AX), SP
	JHI	2(PC)
	INT	$3
	CMPQ	SP, (g_stack+stack_lo)(AX)
	JHI	2(PC)
	INT	$3
	RET

TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
	MOVQ	argp+0(FP),AX		// addr of first arg
	MOVQ	-8(AX),AX		// get calling pc
	MOVQ	AX, ret+8(FP)
	RET

// func cputicks() int64
// Serialize with LFENCE on Intel (per the flag set in rt0_go),
// MFENCE elsewhere, then read the time-stamp counter.
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
	JNE	mfence
	LFENCE
	JMP	done
mfence:
	MFENCE
done:
	RDTSC
	// RDTSC returns the counter split across DX:AX; combine into AX.
	SHLQ	$32, DX
	ADDQ	DX, AX
	MOVQ	AX, ret+0(FP)
	RET

// hash function using AES hardware instructions
TEXT runtime·aeshash(SB),NOSPLIT,$0-32
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	s+16(FP), CX	// size
	LEAQ	ret+24(FP), DX
	JMP	runtime·aeshashbody(SB)

TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to string struct
	MOVQ	8(AX), CX	// length of string
	MOVQ	(AX), AX	// string data
	LEAQ	ret+16(FP), DX
	JMP	runtime·aeshashbody(SB)

// AX: data
// CX: length
// DX: address to put return value
// Dispatches on length to size-specialized code paths; each path
// mixes the data with per-process AES key schedule material and
// scrambles with AESENC rounds.
TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
	// Fill an SSE register with our seeds.
	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
	PINSRW	$4, CX, X0			// 16 bits of length
	PSHUFHW $0, X0, X0			// repeat length 4 times total
	MOVO	X0, X1				// save unscrambled seed
	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
	AESENC	X0, X0				// scramble seed

	CMPQ	CX, $16
	JB	aes0to15
	JE	aes16
	CMPQ	CX, $32
	JBE	aes17to32
	CMPQ	CX, $64
	JBE	aes33to64
	CMPQ	CX, $128
	JBE	aes65to128
	JMP	aes129plus

aes0to15:
	TESTQ	CX, CX
	JE	aes0

	ADDQ	$16, AX
	TESTW	$0xff0, AX
	JE	endofpage

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X1
	ADDQ	CX, CX
	MOVQ	$masks<>(SB), AX
	PAND	(AX)(CX*8), X1	// mask off bytes beyond the data length
final1:
	PXOR	X0, X1	// xor data with seed
	AESENC	X1, X1	// scramble combo 3 times
	AESENC	X1, X1
	AESENC	X1, X1
	MOVQ	X1, (DX)
	RET

endofpage:
	// address ends in 1111xxxx. Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	MOVOU	-32(AX)(CX*1), X1
	ADDQ	CX, CX
	MOVQ	$shifts<>(SB), AX
	PSHUFB	(AX)(CX*8), X1
	JMP	final1

aes0:
	// Return scrambled input seed
	AESENC	X0, X0
	MOVQ	X0, (DX)
	RET

aes16:
	MOVOU	(AX), X1
	JMP	final1

aes17to32:
	// make second starting seed
	PXOR	runtime·aeskeysched+16(SB), X1
	AESENC	X1, X1

	// load data to be hashed
	MOVOU	(AX), X2
	MOVOU	-16(AX)(CX*1), X3	// last 16 bytes; may overlap X2's load

	// xor with seed
	PXOR	X0, X2
	PXOR	X1, X3

	// scramble 3 times
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3

	// combine results
	PXOR	X3, X2
	MOVQ	X2, (DX)
	RET

aes33to64:
	// make 3 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3

	MOVOU	(AX), X4
	MOVOU	16(AX), X5
	MOVOU	-32(AX)(CX*1), X6
	MOVOU	-16(AX)(CX*1), X7

	PXOR	X0, X4
	PXOR	X1, X5
	PXOR	X2, X6
	PXOR	X3, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	PXOR	X6, X4
	PXOR	X7, X5
	PXOR	X5, X4
	MOVQ	X4, (DX)
	RET

aes65to128:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// load data
	MOVOU	(AX), X8
	MOVOU	16(AX), X9
	MOVOU	32(AX), X10
	MOVOU	48(AX), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor with seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// scramble 3 times
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

aes129plus:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// start with last (possibly overlapping) block
	MOVOU	-128(AX)(CX*1), X8
	MOVOU	-112(AX)(CX*1), X9
	MOVOU	-96(AX)(CX*1), X10
	MOVOU	-80(AX)(CX*1), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor in seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// compute number of remaining 128-byte blocks
	DECQ	CX
	SHRQ	$7, CX

aesloop:
	// scramble state
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// scramble state, xor in a block
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	MOVOU	64(AX), X4
	MOVOU	80(AX), X5
	MOVOU	96(AX), X6
	MOVOU	112(AX), X7
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	ADDQ	$128, AX
	DECQ	CX
	JNE	aesloop

	// 3 more scrambles to finish
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

// simple mask to get rid of data in the high part of the register.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256

TEXT ·checkASM(SB),NOSPLIT,$0-1
	//
check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte 1259 MOVQ $masks<>(SB), AX 1260 MOVQ $shifts<>(SB), BX 1261 ORQ BX, AX 1262 TESTQ $15, AX 1263 SETEQ ret+0(FP) 1264 RET 1265 1266 // these are arguments to pshufb. They move data down from 1267 // the high bytes of the register to the low bytes of the register. 1268 // index is how many bytes to move. 1269 DATA shifts<>+0x00(SB)/8, $0x0000000000000000 1270 DATA shifts<>+0x08(SB)/8, $0x0000000000000000 1271 DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f 1272 DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff 1273 DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e 1274 DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff 1275 DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d 1276 DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff 1277 DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c 1278 DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff 1279 DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b 1280 DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff 1281 DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a 1282 DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff 1283 DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09 1284 DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff 1285 DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908 1286 DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff 1287 DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807 1288 DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f 1289 DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706 1290 DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e 1291 DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605 1292 DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d 1293 DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504 1294 DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c 1295 DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403 1296 DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b 1297 DATA shifts<>+0xe0(SB)/8, $0x0908070605040302 1298 DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a 1299 DATA shifts<>+0xf0(SB)/8, $0x0807060504030201 1300 DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09 
1301 GLOBL shifts<>(SB),RODATA,$256 1302 1303 // memequal(p, q unsafe.Pointer, size uintptr) bool 1304 TEXT runtime·memequal(SB),NOSPLIT,$0-25 1305 MOVQ a+0(FP), SI 1306 MOVQ b+8(FP), DI 1307 CMPQ SI, DI 1308 JEQ eq 1309 MOVQ size+16(FP), BX 1310 LEAQ ret+24(FP), AX 1311 JMP runtime·memeqbody(SB) 1312 eq: 1313 MOVB $1, ret+24(FP) 1314 RET 1315 1316 // memequal_varlen(a, b unsafe.Pointer) bool 1317 TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17 1318 MOVQ a+0(FP), SI 1319 MOVQ b+8(FP), DI 1320 CMPQ SI, DI 1321 JEQ eq 1322 MOVQ 8(DX), BX // compiler stores size at offset 8 in the closure 1323 LEAQ ret+16(FP), AX 1324 JMP runtime·memeqbody(SB) 1325 eq: 1326 MOVB $1, ret+16(FP) 1327 RET 1328 1329 // a in SI 1330 // b in DI 1331 // count in BX 1332 // address of result byte in AX 1333 TEXT runtime·memeqbody(SB),NOSPLIT,$0-0 1334 CMPQ BX, $8 1335 JB small 1336 CMPQ BX, $64 1337 JB bigloop 1338 CMPB runtime·support_avx2(SB), $1 1339 JE hugeloop_avx2 1340 1341 // 64 bytes at a time using xmm registers 1342 hugeloop: 1343 CMPQ BX, $64 1344 JB bigloop 1345 MOVOU (SI), X0 1346 MOVOU (DI), X1 1347 MOVOU 16(SI), X2 1348 MOVOU 16(DI), X3 1349 MOVOU 32(SI), X4 1350 MOVOU 32(DI), X5 1351 MOVOU 48(SI), X6 1352 MOVOU 48(DI), X7 1353 PCMPEQB X1, X0 1354 PCMPEQB X3, X2 1355 PCMPEQB X5, X4 1356 PCMPEQB X7, X6 1357 PAND X2, X0 1358 PAND X6, X4 1359 PAND X4, X0 1360 PMOVMSKB X0, DX 1361 ADDQ $64, SI 1362 ADDQ $64, DI 1363 SUBQ $64, BX 1364 CMPL DX, $0xffff 1365 JEQ hugeloop 1366 MOVB $0, (AX) 1367 RET 1368 1369 // 64 bytes at a time using ymm registers 1370 hugeloop_avx2: 1371 CMPQ BX, $64 1372 JB bigloop_avx2 1373 VMOVDQU (SI), Y0 1374 VMOVDQU (DI), Y1 1375 VMOVDQU 32(SI), Y2 1376 VMOVDQU 32(DI), Y3 1377 VPCMPEQB Y1, Y0, Y4 1378 VPCMPEQB Y2, Y3, Y5 1379 VPAND Y4, Y5, Y6 1380 VPMOVMSKB Y6, DX 1381 ADDQ $64, SI 1382 ADDQ $64, DI 1383 SUBQ $64, BX 1384 CMPL DX, $0xffffffff 1385 JEQ hugeloop_avx2 1386 VZEROUPPER 1387 MOVB $0, (AX) 1388 RET 1389 1390 bigloop_avx2: 1391 VZEROUPPER 1392 1393 // 
8 bytes at a time using 64-bit register 1394 bigloop: 1395 CMPQ BX, $8 1396 JBE leftover 1397 MOVQ (SI), CX 1398 MOVQ (DI), DX 1399 ADDQ $8, SI 1400 ADDQ $8, DI 1401 SUBQ $8, BX 1402 CMPQ CX, DX 1403 JEQ bigloop 1404 MOVB $0, (AX) 1405 RET 1406 1407 // remaining 0-8 bytes 1408 leftover: 1409 MOVQ -8(SI)(BX*1), CX 1410 MOVQ -8(DI)(BX*1), DX 1411 CMPQ CX, DX 1412 SETEQ (AX) 1413 RET 1414 1415 small: 1416 CMPQ BX, $0 1417 JEQ equal 1418 1419 LEAQ 0(BX*8), CX 1420 NEGQ CX 1421 1422 CMPB SI, $0xf8 1423 JA si_high 1424 1425 // load at SI won't cross a page boundary. 1426 MOVQ (SI), SI 1427 JMP si_finish 1428 si_high: 1429 // address ends in 11111xxx. Load up to bytes we want, move to correct position. 1430 MOVQ -8(SI)(BX*1), SI 1431 SHRQ CX, SI 1432 si_finish: 1433 1434 // same for DI. 1435 CMPB DI, $0xf8 1436 JA di_high 1437 MOVQ (DI), DI 1438 JMP di_finish 1439 di_high: 1440 MOVQ -8(DI)(BX*1), DI 1441 SHRQ CX, DI 1442 di_finish: 1443 1444 SUBQ SI, DI 1445 SHLQ CX, DI 1446 equal: 1447 SETEQ (AX) 1448 RET 1449 1450 TEXT runtime·cmpstring(SB),NOSPLIT,$0-40 1451 MOVQ s1_base+0(FP), SI 1452 MOVQ s1_len+8(FP), BX 1453 MOVQ s2_base+16(FP), DI 1454 MOVQ s2_len+24(FP), DX 1455 LEAQ ret+32(FP), R9 1456 JMP runtime·cmpbody(SB) 1457 1458 TEXT bytes·Compare(SB),NOSPLIT,$0-56 1459 MOVQ s1+0(FP), SI 1460 MOVQ s1+8(FP), BX 1461 MOVQ s2+24(FP), DI 1462 MOVQ s2+32(FP), DX 1463 LEAQ res+48(FP), R9 1464 JMP runtime·cmpbody(SB) 1465 1466 // input: 1467 // SI = a 1468 // DI = b 1469 // BX = alen 1470 // DX = blen 1471 // R9 = address of output word (stores -1/0/1 here) 1472 TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 1473 CMPQ SI, DI 1474 JEQ allsame 1475 CMPQ BX, DX 1476 MOVQ DX, R8 1477 CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare 1478 CMPQ R8, $8 1479 JB small 1480 1481 CMPQ R8, $63 1482 JBE loop 1483 CMPB runtime·support_avx2(SB), $1 1484 JEQ big_loop_avx2 1485 JMP big_loop 1486 loop: 1487 CMPQ R8, $16 1488 JBE _0through16 1489 MOVOU (SI), X0 1490 MOVOU (DI), X1 1491 PCMPEQB 
X0, X1 1492 PMOVMSKB X1, AX 1493 XORQ $0xffff, AX // convert EQ to NE 1494 JNE diff16 // branch if at least one byte is not equal 1495 ADDQ $16, SI 1496 ADDQ $16, DI 1497 SUBQ $16, R8 1498 JMP loop 1499 1500 diff64: 1501 ADDQ $48, SI 1502 ADDQ $48, DI 1503 JMP diff16 1504 diff48: 1505 ADDQ $32, SI 1506 ADDQ $32, DI 1507 JMP diff16 1508 diff32: 1509 ADDQ $16, SI 1510 ADDQ $16, DI 1511 // AX = bit mask of differences 1512 diff16: 1513 BSFQ AX, BX // index of first byte that differs 1514 XORQ AX, AX 1515 MOVB (SI)(BX*1), CX 1516 CMPB CX, (DI)(BX*1) 1517 SETHI AX 1518 LEAQ -1(AX*2), AX // convert 1/0 to +1/-1 1519 MOVQ AX, (R9) 1520 RET 1521 1522 // 0 through 16 bytes left, alen>=8, blen>=8 1523 _0through16: 1524 CMPQ R8, $8 1525 JBE _0through8 1526 MOVQ (SI), AX 1527 MOVQ (DI), CX 1528 CMPQ AX, CX 1529 JNE diff8 1530 _0through8: 1531 MOVQ -8(SI)(R8*1), AX 1532 MOVQ -8(DI)(R8*1), CX 1533 CMPQ AX, CX 1534 JEQ allsame 1535 1536 // AX and CX contain parts of a and b that differ. 1537 diff8: 1538 BSWAPQ AX // reverse order of bytes 1539 BSWAPQ CX 1540 XORQ AX, CX 1541 BSRQ CX, CX // index of highest bit difference 1542 SHRQ CX, AX // move a's bit to bottom 1543 ANDQ $1, AX // mask bit 1544 LEAQ -1(AX*2), AX // 1/0 => +1/-1 1545 MOVQ AX, (R9) 1546 RET 1547 1548 // 0-7 bytes in common 1549 small: 1550 LEAQ (R8*8), CX // bytes left -> bits left 1551 NEGQ CX // - bits lift (== 64 - bits left mod 64) 1552 JEQ allsame 1553 1554 // load bytes of a into high bytes of AX 1555 CMPB SI, $0xf8 1556 JA si_high 1557 MOVQ (SI), SI 1558 JMP si_finish 1559 si_high: 1560 MOVQ -8(SI)(R8*1), SI 1561 SHRQ CX, SI 1562 si_finish: 1563 SHLQ CX, SI 1564 1565 // load bytes of b in to high bytes of BX 1566 CMPB DI, $0xf8 1567 JA di_high 1568 MOVQ (DI), DI 1569 JMP di_finish 1570 di_high: 1571 MOVQ -8(DI)(R8*1), DI 1572 SHRQ CX, DI 1573 di_finish: 1574 SHLQ CX, DI 1575 1576 BSWAPQ SI // reverse order of bytes 1577 BSWAPQ DI 1578 XORQ SI, DI // find bit differences 1579 JEQ allsame 1580 BSRQ DI, CX // 
index of highest bit difference 1581 SHRQ CX, SI // move a's bit to bottom 1582 ANDQ $1, SI // mask bit 1583 LEAQ -1(SI*2), AX // 1/0 => +1/-1 1584 MOVQ AX, (R9) 1585 RET 1586 1587 allsame: 1588 XORQ AX, AX 1589 XORQ CX, CX 1590 CMPQ BX, DX 1591 SETGT AX // 1 if alen > blen 1592 SETEQ CX // 1 if alen == blen 1593 LEAQ -1(CX)(AX*2), AX // 1,0,-1 result 1594 MOVQ AX, (R9) 1595 RET 1596 1597 // this works for >= 64 bytes of data. 1598 big_loop: 1599 MOVOU (SI), X0 1600 MOVOU (DI), X1 1601 PCMPEQB X0, X1 1602 PMOVMSKB X1, AX 1603 XORQ $0xffff, AX 1604 JNE diff16 1605 1606 MOVOU 16(SI), X0 1607 MOVOU 16(DI), X1 1608 PCMPEQB X0, X1 1609 PMOVMSKB X1, AX 1610 XORQ $0xffff, AX 1611 JNE diff32 1612 1613 MOVOU 32(SI), X0 1614 MOVOU 32(DI), X1 1615 PCMPEQB X0, X1 1616 PMOVMSKB X1, AX 1617 XORQ $0xffff, AX 1618 JNE diff48 1619 1620 MOVOU 48(SI), X0 1621 MOVOU 48(DI), X1 1622 PCMPEQB X0, X1 1623 PMOVMSKB X1, AX 1624 XORQ $0xffff, AX 1625 JNE diff64 1626 1627 ADDQ $64, SI 1628 ADDQ $64, DI 1629 SUBQ $64, R8 1630 CMPQ R8, $64 1631 JBE loop 1632 JMP big_loop 1633 1634 // Compare 64-bytes per loop iteration. 1635 // Loop is unrolled and uses AVX2. 1636 big_loop_avx2: 1637 VMOVDQU (SI), Y2 1638 VMOVDQU (DI), Y3 1639 VMOVDQU 32(SI), Y4 1640 VMOVDQU 32(DI), Y5 1641 VPCMPEQB Y2, Y3, Y0 1642 VPMOVMSKB Y0, AX 1643 XORL $0xffffffff, AX 1644 JNE diff32_avx2 1645 VPCMPEQB Y4, Y5, Y6 1646 VPMOVMSKB Y6, AX 1647 XORL $0xffffffff, AX 1648 JNE diff64_avx2 1649 1650 ADDQ $64, SI 1651 ADDQ $64, DI 1652 SUBQ $64, R8 1653 CMPQ R8, $64 1654 JB big_loop_avx2_exit 1655 JMP big_loop_avx2 1656 1657 // Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk. 1658 diff32_avx2: 1659 VZEROUPPER 1660 JMP diff16 1661 1662 // Same as diff32_avx2, but for last 32 bytes. 1663 diff64_avx2: 1664 VZEROUPPER 1665 JMP diff48 1666 1667 // For <64 bytes remainder jump to normal loop. 
1668 big_loop_avx2_exit: 1669 VZEROUPPER 1670 JMP loop 1671 1672 TEXT strings·indexShortStr(SB),NOSPLIT,$0-40 1673 MOVQ s+0(FP), DI 1674 // We want len in DX and AX, because PCMPESTRI implicitly consumes them 1675 MOVQ s_len+8(FP), DX 1676 MOVQ c+16(FP), BP 1677 MOVQ c_len+24(FP), AX 1678 MOVQ DI, R10 1679 LEAQ ret+32(FP), R11 1680 JMP runtime·indexShortStr(SB) 1681 1682 TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56 1683 MOVQ s+0(FP), DI 1684 MOVQ s_len+8(FP), DX 1685 MOVQ c+24(FP), BP 1686 MOVQ c_len+32(FP), AX 1687 MOVQ DI, R10 1688 LEAQ ret+48(FP), R11 1689 JMP runtime·indexShortStr(SB) 1690 1691 // AX: length of string, that we are searching for 1692 // DX: length of string, in which we are searching 1693 // DI: pointer to string, in which we are searching 1694 // BP: pointer to string, that we are searching for 1695 // R11: address, where to put return value 1696 TEXT runtime·indexShortStr(SB),NOSPLIT,$0 1697 CMPQ AX, DX 1698 JA fail 1699 CMPQ DX, $16 1700 JAE sse42 1701 no_sse42: 1702 CMPQ AX, $2 1703 JA _3_or_more 1704 MOVW (BP), BP 1705 LEAQ -1(DI)(DX*1), DX 1706 loop2: 1707 MOVW (DI), SI 1708 CMPW SI,BP 1709 JZ success 1710 ADDQ $1,DI 1711 CMPQ DI,DX 1712 JB loop2 1713 JMP fail 1714 _3_or_more: 1715 CMPQ AX, $3 1716 JA _4_or_more 1717 MOVW 1(BP), BX 1718 MOVW (BP), BP 1719 LEAQ -2(DI)(DX*1), DX 1720 loop3: 1721 MOVW (DI), SI 1722 CMPW SI,BP 1723 JZ partial_success3 1724 ADDQ $1,DI 1725 CMPQ DI,DX 1726 JB loop3 1727 JMP fail 1728 partial_success3: 1729 MOVW 1(DI), SI 1730 CMPW SI,BX 1731 JZ success 1732 ADDQ $1,DI 1733 CMPQ DI,DX 1734 JB loop3 1735 JMP fail 1736 _4_or_more: 1737 CMPQ AX, $4 1738 JA _5_or_more 1739 MOVL (BP), BP 1740 LEAQ -3(DI)(DX*1), DX 1741 loop4: 1742 MOVL (DI), SI 1743 CMPL SI,BP 1744 JZ success 1745 ADDQ $1,DI 1746 CMPQ DI,DX 1747 JB loop4 1748 JMP fail 1749 _5_or_more: 1750 CMPQ AX, $7 1751 JA _8_or_more 1752 LEAQ 1(DI)(DX*1), DX 1753 SUBQ AX, DX 1754 MOVL -4(BP)(AX*1), BX 1755 MOVL (BP), BP 1756 loop5to7: 1757 MOVL (DI), SI 1758 CMPL 
SI,BP 1759 JZ partial_success5to7 1760 ADDQ $1,DI 1761 CMPQ DI,DX 1762 JB loop5to7 1763 JMP fail 1764 partial_success5to7: 1765 MOVL -4(AX)(DI*1), SI 1766 CMPL SI,BX 1767 JZ success 1768 ADDQ $1,DI 1769 CMPQ DI,DX 1770 JB loop5to7 1771 JMP fail 1772 _8_or_more: 1773 CMPQ AX, $8 1774 JA _9_or_more 1775 MOVQ (BP), BP 1776 LEAQ -7(DI)(DX*1), DX 1777 loop8: 1778 MOVQ (DI), SI 1779 CMPQ SI,BP 1780 JZ success 1781 ADDQ $1,DI 1782 CMPQ DI,DX 1783 JB loop8 1784 JMP fail 1785 _9_or_more: 1786 CMPQ AX, $15 1787 JA _16_or_more 1788 LEAQ 1(DI)(DX*1), DX 1789 SUBQ AX, DX 1790 MOVQ -8(BP)(AX*1), BX 1791 MOVQ (BP), BP 1792 loop9to15: 1793 MOVQ (DI), SI 1794 CMPQ SI,BP 1795 JZ partial_success9to15 1796 ADDQ $1,DI 1797 CMPQ DI,DX 1798 JB loop9to15 1799 JMP fail 1800 partial_success9to15: 1801 MOVQ -8(AX)(DI*1), SI 1802 CMPQ SI,BX 1803 JZ success 1804 ADDQ $1,DI 1805 CMPQ DI,DX 1806 JB loop9to15 1807 JMP fail 1808 _16_or_more: 1809 CMPQ AX, $16 1810 JA _17_or_more 1811 MOVOU (BP), X1 1812 LEAQ -15(DI)(DX*1), DX 1813 loop16: 1814 MOVOU (DI), X2 1815 PCMPEQB X1, X2 1816 PMOVMSKB X2, SI 1817 CMPQ SI, $0xffff 1818 JE success 1819 ADDQ $1,DI 1820 CMPQ DI,DX 1821 JB loop16 1822 JMP fail 1823 _17_or_more: 1824 CMPQ AX, $31 1825 JA _32_or_more 1826 LEAQ 1(DI)(DX*1), DX 1827 SUBQ AX, DX 1828 MOVOU -16(BP)(AX*1), X0 1829 MOVOU (BP), X1 1830 loop17to31: 1831 MOVOU (DI), X2 1832 PCMPEQB X1,X2 1833 PMOVMSKB X2, SI 1834 CMPQ SI, $0xffff 1835 JE partial_success17to31 1836 ADDQ $1,DI 1837 CMPQ DI,DX 1838 JB loop17to31 1839 JMP fail 1840 partial_success17to31: 1841 MOVOU -16(AX)(DI*1), X3 1842 PCMPEQB X0, X3 1843 PMOVMSKB X3, SI 1844 CMPQ SI, $0xffff 1845 JE success 1846 ADDQ $1,DI 1847 CMPQ DI,DX 1848 JB loop17to31 1849 JMP fail 1850 // We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63 1851 // So no need to check cpuid 1852 _32_or_more: 1853 CMPQ AX, $32 1854 JA _33_to_63 1855 VMOVDQU (BP), Y1 1856 LEAQ -31(DI)(DX*1), DX 1857 loop32: 1858 VMOVDQU (DI), Y2 1859 
VPCMPEQB Y1, Y2, Y3 1860 VPMOVMSKB Y3, SI 1861 CMPL SI, $0xffffffff 1862 JE success_avx2 1863 ADDQ $1,DI 1864 CMPQ DI,DX 1865 JB loop32 1866 JMP fail_avx2 1867 _33_to_63: 1868 LEAQ 1(DI)(DX*1), DX 1869 SUBQ AX, DX 1870 VMOVDQU -32(BP)(AX*1), Y0 1871 VMOVDQU (BP), Y1 1872 loop33to63: 1873 VMOVDQU (DI), Y2 1874 VPCMPEQB Y1, Y2, Y3 1875 VPMOVMSKB Y3, SI 1876 CMPL SI, $0xffffffff 1877 JE partial_success33to63 1878 ADDQ $1,DI 1879 CMPQ DI,DX 1880 JB loop33to63 1881 JMP fail_avx2 1882 partial_success33to63: 1883 VMOVDQU -32(AX)(DI*1), Y3 1884 VPCMPEQB Y0, Y3, Y4 1885 VPMOVMSKB Y4, SI 1886 CMPL SI, $0xffffffff 1887 JE success_avx2 1888 ADDQ $1,DI 1889 CMPQ DI,DX 1890 JB loop33to63 1891 fail_avx2: 1892 VZEROUPPER 1893 fail: 1894 MOVQ $-1, (R11) 1895 RET 1896 success_avx2: 1897 VZEROUPPER 1898 JMP success 1899 sse42: 1900 CMPB runtime·support_sse42(SB), $1 1901 JNE no_sse42 1902 CMPQ AX, $12 1903 // PCMPESTRI is slower than normal compare, 1904 // so using it makes sense only if we advance 4+ bytes per compare 1905 // This value was determined experimentally and is the ~same 1906 // on Nehalem (first with SSE42) and Haswell. 
1907 JAE _9_or_more 1908 LEAQ 16(BP), SI 1909 TESTW $0xff0, SI 1910 JEQ no_sse42 1911 MOVOU (BP), X1 1912 LEAQ -15(DI)(DX*1), SI 1913 MOVQ $16, R9 1914 SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9 1915 loop_sse42: 1916 // 0x0c means: unsigned byte compare (bits 0,1 are 00) 1917 // for equality (bits 2,3 are 11) 1918 // result is not masked or inverted (bits 4,5 are 00) 1919 // and corresponds to first matching byte (bit 6 is 0) 1920 PCMPESTRI $0x0c, (DI), X1 1921 // CX == 16 means no match, 1922 // CX > R9 means partial match at the end of the string, 1923 // otherwise sep is at offset CX from X1 start 1924 CMPQ CX, R9 1925 JBE sse42_success 1926 ADDQ R9, DI 1927 CMPQ DI, SI 1928 JB loop_sse42 1929 PCMPESTRI $0x0c, -1(SI), X1 1930 CMPQ CX, R9 1931 JA fail 1932 LEAQ -1(SI), DI 1933 sse42_success: 1934 ADDQ CX, DI 1935 success: 1936 SUBQ R10, DI 1937 MOVQ DI, (R11) 1938 RET 1939 1940 1941 TEXT bytes·IndexByte(SB),NOSPLIT,$0-40 1942 MOVQ s+0(FP), SI 1943 MOVQ s_len+8(FP), BX 1944 MOVB c+24(FP), AL 1945 LEAQ ret+32(FP), R8 1946 JMP runtime·indexbytebody(SB) 1947 1948 TEXT strings·IndexByte(SB),NOSPLIT,$0-32 1949 MOVQ s+0(FP), SI 1950 MOVQ s_len+8(FP), BX 1951 MOVB c+16(FP), AL 1952 LEAQ ret+24(FP), R8 1953 JMP runtime·indexbytebody(SB) 1954 1955 // input: 1956 // SI: data 1957 // BX: data len 1958 // AL: byte sought 1959 // R8: address to put result 1960 TEXT runtime·indexbytebody(SB),NOSPLIT,$0 1961 // Shuffle X0 around so that each byte contains 1962 // the character we're looking for. 1963 MOVD AX, X0 1964 PUNPCKLBW X0, X0 1965 PUNPCKLBW X0, X0 1966 PSHUFL $0, X0, X0 1967 1968 CMPQ BX, $16 1969 JLT small 1970 1971 MOVQ SI, DI 1972 1973 CMPQ BX, $32 1974 JA avx2 1975 sse: 1976 LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes 1977 JMP sseloopentry 1978 1979 sseloop: 1980 // Move the next 16-byte chunk of the data into X1. 1981 MOVOU (DI), X1 1982 // Compare bytes in X0 to X1. 
1983 PCMPEQB X0, X1 1984 // Take the top bit of each byte in X1 and put the result in DX. 1985 PMOVMSKB X1, DX 1986 // Find first set bit, if any. 1987 BSFL DX, DX 1988 JNZ ssesuccess 1989 // Advance to next block. 1990 ADDQ $16, DI 1991 sseloopentry: 1992 CMPQ DI, AX 1993 JB sseloop 1994 1995 // Search the last 16-byte chunk. This chunk may overlap with the 1996 // chunks we've already searched, but that's ok. 1997 MOVQ AX, DI 1998 MOVOU (AX), X1 1999 PCMPEQB X0, X1 2000 PMOVMSKB X1, DX 2001 BSFL DX, DX 2002 JNZ ssesuccess 2003 2004 failure: 2005 MOVQ $-1, (R8) 2006 RET 2007 2008 // We've found a chunk containing the byte. 2009 // The chunk was loaded from DI. 2010 // The index of the matching byte in the chunk is DX. 2011 // The start of the data is SI. 2012 ssesuccess: 2013 SUBQ SI, DI // Compute offset of chunk within data. 2014 ADDQ DX, DI // Add offset of byte within chunk. 2015 MOVQ DI, (R8) 2016 RET 2017 2018 // handle for lengths < 16 2019 small: 2020 TESTQ BX, BX 2021 JEQ failure 2022 2023 // Check if we'll load across a page boundary. 2024 LEAQ 16(SI), AX 2025 TESTW $0xff0, AX 2026 JEQ endofpage 2027 2028 MOVOU (SI), X1 // Load data 2029 PCMPEQB X0, X1 // Compare target byte with each byte in data. 2030 PMOVMSKB X1, DX // Move result bits to integer register. 2031 BSFL DX, DX // Find first set bit. 2032 JZ failure // No set bit, failure. 2033 CMPL DX, BX 2034 JAE failure // Match is past end of data. 2035 MOVQ DX, (R8) 2036 RET 2037 2038 endofpage: 2039 MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1. 2040 PCMPEQB X0, X1 // Compare target byte with each byte in data. 2041 PMOVMSKB X1, DX // Move result bits to integer register. 2042 MOVL BX, CX 2043 SHLL CX, DX 2044 SHRL $16, DX // Shift desired bits down to bottom of register. 2045 BSFL DX, DX // Find first set bit. 2046 JZ failure // No set bit, failure. 
2047 MOVQ DX, (R8) 2048 RET 2049 2050 avx2: 2051 CMPB runtime·support_avx2(SB), $1 2052 JNE sse 2053 MOVD AX, X0 2054 LEAQ -32(SI)(BX*1), R11 2055 VPBROADCASTB X0, Y1 2056 avx2_loop: 2057 VMOVDQU (DI), Y2 2058 VPCMPEQB Y1, Y2, Y3 2059 VPTEST Y3, Y3 2060 JNZ avx2success 2061 ADDQ $32, DI 2062 CMPQ DI, R11 2063 JLT avx2_loop 2064 MOVQ R11, DI 2065 VMOVDQU (DI), Y2 2066 VPCMPEQB Y1, Y2, Y3 2067 VPTEST Y3, Y3 2068 JNZ avx2success 2069 VZEROUPPER 2070 MOVQ $-1, (R8) 2071 RET 2072 2073 avx2success: 2074 VPMOVMSKB Y3, DX 2075 BSFL DX, DX 2076 SUBQ SI, DI 2077 ADDQ DI, DX 2078 MOVQ DX, (R8) 2079 VZEROUPPER 2080 RET 2081 2082 TEXT bytes·Equal(SB),NOSPLIT,$0-49 2083 MOVQ a_len+8(FP), BX 2084 MOVQ b_len+32(FP), CX 2085 CMPQ BX, CX 2086 JNE eqret 2087 MOVQ a+0(FP), SI 2088 MOVQ b+24(FP), DI 2089 LEAQ ret+48(FP), AX 2090 JMP runtime·memeqbody(SB) 2091 eqret: 2092 MOVB $0, ret+48(FP) 2093 RET 2094 2095 2096 TEXT bytes·countByte(SB),NOSPLIT,$0-40 2097 MOVQ s+0(FP), SI 2098 MOVQ s_len+8(FP), BX 2099 MOVB c+24(FP), AL 2100 LEAQ ret+32(FP), R8 2101 JMP runtime·countByte(SB) 2102 2103 TEXT strings·countByte(SB),NOSPLIT,$0-32 2104 MOVQ s+0(FP), SI 2105 MOVQ s_len+8(FP), BX 2106 MOVB c+16(FP), AL 2107 LEAQ ret+24(FP), R8 2108 JMP runtime·countByte(SB) 2109 2110 // input: 2111 // SI: data 2112 // BX: data len 2113 // AL: byte sought 2114 // R8: address to put result 2115 // This requires the POPCNT instruction 2116 TEXT runtime·countByte(SB),NOSPLIT,$0 2117 // Shuffle X0 around so that each byte contains 2118 // the character we're looking for. 2119 MOVD AX, X0 2120 PUNPCKLBW X0, X0 2121 PUNPCKLBW X0, X0 2122 PSHUFL $0, X0, X0 2123 2124 CMPQ BX, $16 2125 JLT small 2126 2127 MOVQ $0, R12 // Accumulator 2128 2129 MOVQ SI, DI 2130 2131 CMPQ BX, $32 2132 JA avx2 2133 sse: 2134 LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes 2135 JMP sseloopentry 2136 2137 sseloop: 2138 // Move the next 16-byte chunk of the data into X1. 2139 MOVOU (DI), X1 2140 // Compare bytes in X0 to X1. 
2141 PCMPEQB X0, X1 2142 // Take the top bit of each byte in X1 and put the result in DX. 2143 PMOVMSKB X1, DX 2144 // Count number of matching bytes 2145 POPCNTL DX, DX 2146 // Accumulate into R12 2147 ADDQ DX, R12 2148 // Advance to next block. 2149 ADDQ $16, DI 2150 sseloopentry: 2151 CMPQ DI, AX 2152 JBE sseloop 2153 2154 // Get the number of bytes to consider in the last 16 bytes 2155 ANDQ $15, BX 2156 JZ end 2157 2158 // Create mask to ignore overlap between previous 16 byte block 2159 // and the next. 2160 MOVQ $16,CX 2161 SUBQ BX, CX 2162 MOVQ $0xFFFF, R10 2163 SARQ CL, R10 2164 SALQ CL, R10 2165 2166 // Process the last 16-byte chunk. This chunk may overlap with the 2167 // chunks we've already searched so we need to mask part of it. 2168 MOVOU (AX), X1 2169 PCMPEQB X0, X1 2170 PMOVMSKB X1, DX 2171 // Apply mask 2172 ANDQ R10, DX 2173 POPCNTL DX, DX 2174 ADDQ DX, R12 2175 end: 2176 MOVQ R12, (R8) 2177 RET 2178 2179 // handle for lengths < 16 2180 small: 2181 TESTQ BX, BX 2182 JEQ endzero 2183 2184 // Check if we'll load across a page boundary. 2185 LEAQ 16(SI), AX 2186 TESTW $0xff0, AX 2187 JEQ endofpage 2188 2189 // We must ignore high bytes as they aren't part of our slice. 2190 // Create mask. 2191 MOVB BX, CX 2192 MOVQ $1, R10 2193 SALQ CL, R10 2194 SUBQ $1, R10 2195 2196 // Load data 2197 MOVOU (SI), X1 2198 // Compare target byte with each byte in data. 2199 PCMPEQB X0, X1 2200 // Move result bits to integer register. 2201 PMOVMSKB X1, DX 2202 // Apply mask 2203 ANDQ R10, DX 2204 POPCNTL DX, DX 2205 // Directly return DX, we don't need to accumulate 2206 // since we have <16 bytes. 2207 MOVQ DX, (R8) 2208 RET 2209 endzero: 2210 MOVQ $0, (R8) 2211 RET 2212 2213 endofpage: 2214 // We must ignore low bytes as they aren't part of our slice. 2215 MOVQ $16,CX 2216 SUBQ BX, CX 2217 MOVQ $0xFFFF, R10 2218 SARQ CL, R10 2219 SALQ CL, R10 2220 2221 // Load data into the high end of X1. 
2222 MOVOU -16(SI)(BX*1), X1 2223 // Compare target byte with each byte in data. 2224 PCMPEQB X0, X1 2225 // Move result bits to integer register. 2226 PMOVMSKB X1, DX 2227 // Apply mask 2228 ANDQ R10, DX 2229 // Directly return DX, we don't need to accumulate 2230 // since we have <16 bytes. 2231 POPCNTL DX, DX 2232 MOVQ DX, (R8) 2233 RET 2234 2235 avx2: 2236 CMPB runtime·support_avx2(SB), $1 2237 JNE sse 2238 MOVD AX, X0 2239 LEAQ -32(SI)(BX*1), R11 2240 VPBROADCASTB X0, Y1 2241 avx2_loop: 2242 VMOVDQU (DI), Y2 2243 VPCMPEQB Y1, Y2, Y3 2244 VPMOVMSKB Y3, DX 2245 POPCNTL DX, DX 2246 ADDQ DX, R12 2247 ADDQ $32, DI 2248 CMPQ DI, R11 2249 JLE avx2_loop 2250 2251 // If last block is already processed, 2252 // skip to the end. 2253 CMPQ DI, R11 2254 JEQ endavx 2255 2256 // Load address of the last 32 bytes. 2257 // There is an overlap with the previous block. 2258 MOVQ R11, DI 2259 VMOVDQU (DI), Y2 2260 VPCMPEQB Y1, Y2, Y3 2261 VPMOVMSKB Y3, DX 2262 // Exit AVX mode. 2263 VZEROUPPER 2264 2265 // Create mask to ignore overlap between previous 32 byte block 2266 // and the next. 2267 ANDQ $31, BX 2268 MOVQ $32,CX 2269 SUBQ BX, CX 2270 MOVQ $0xFFFFFFFF, R10 2271 SARQ CL, R10 2272 SALQ CL, R10 2273 // Apply mask 2274 ANDQ R10, DX 2275 POPCNTL DX, DX 2276 ADDQ DX, R12 2277 MOVQ R12, (R8) 2278 RET 2279 endavx: 2280 // Exit AVX mode. 2281 VZEROUPPER 2282 MOVQ R12, (R8) 2283 RET 2284 2285 TEXT runtime·return0(SB), NOSPLIT, $0 2286 MOVL $0, AX 2287 RET 2288 2289 2290 // Called from cgo wrappers, this function returns g->m->curg.stack.hi. 2291 // Must obey the gcc calling convention. 2292 TEXT _cgo_topofstack(SB),NOSPLIT,$0 2293 get_tls(CX) 2294 MOVQ g(CX), AX 2295 MOVQ g_m(AX), AX 2296 MOVQ m_curg(AX), AX 2297 MOVQ (g_stack+stack_hi)(AX), AX 2298 RET 2299 2300 // The top-most function running on a goroutine 2301 // returns to goexit+PCQuantum. 
2302 TEXT runtime·goexit(SB),NOSPLIT,$0-0 2303 BYTE $0x90 // NOP 2304 CALL runtime·goexit1(SB) // does not return 2305 // traceback from goexit1 must hit code range of goexit 2306 BYTE $0x90 // NOP 2307 2308 // This is called from .init_array and follows the platform, not Go, ABI. 2309 TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0 2310 PUSHQ R15 // The access to global variables below implicitly uses R15, which is callee-save 2311 MOVQ runtime·lastmoduledatap(SB), AX 2312 MOVQ DI, moduledata_next(AX) 2313 MOVQ DI, runtime·lastmoduledatap(SB) 2314 POPQ R15 2315 RET