// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "go_tls.h"
#include "funcdata.h"
#include "textflag.h"

// rt0_go is the program bootstrap: it sets up g0's stack bounds from the
// OS-provided stack, probes CPU features via CPUID, initializes TLS,
// wires up m0<->g0, runs args/osinit/schedinit, queues runtime.main as the
// first goroutine, and starts this M. It never returns.
TEXT runtime·rt0_go(SB),NOSPLIT,$0
	// copy arguments forward on an even stack
	MOVQ	DI, AX		// argc
	MOVQ	SI, BX		// argv
	SUBQ	$(4*8+7), SP		// 2args 2auto
	ANDQ	$~15, SP
	MOVQ	AX, 16(SP)
	MOVQ	BX, 24(SP)

	// create istack out of the given (operating system) stack.
	// _cgo_init may update stackguard.
	MOVQ	$runtime·g0(SB), DI
	LEAQ	(-64*1024+104)(SP), BX
	MOVQ	BX, g_stackguard0(DI)
	MOVQ	BX, g_stackguard1(DI)
	MOVQ	BX, (g_stack+stack_lo)(DI)
	MOVQ	SP, (g_stack+stack_hi)(DI)

	// find out information about the processor we're on
	MOVL	$0, AX
	CPUID
	MOVL	AX, SI		// SI = max basic CPUID leaf, checked before leaf 7 below
	CMPL	AX, $0
	JE	nocpuinfo

	// Figure out how to serialize RDTSC.
	// On Intel processors LFENCE is enough. AMD requires MFENCE.
	// Don't know about the rest, so let's do MFENCE.
	CMPL	BX, $0x756E6547	// "Genu"
	JNE	notintel
	CMPL	DX, $0x49656E69	// "ineI"
	JNE	notintel
	CMPL	CX, $0x6C65746E	// "ntel"
	JNE	notintel
	MOVB	$1, runtime·isIntel(SB)
	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
notintel:

	// Load EAX=1 cpuid flags
	MOVL	$1, AX
	CPUID
	MOVL	AX, runtime·processorVersionInfo(SB)

	TESTL	$(1<<26), DX	// SSE2
	SETNE	runtime·support_sse2(SB)

	TESTL	$(1<<9), CX	// SSSE3
	SETNE	runtime·support_ssse3(SB)

	TESTL	$(1<<19), CX	// SSE4.1
	SETNE	runtime·support_sse41(SB)

	TESTL	$(1<<20), CX	// SSE4.2
	SETNE	runtime·support_sse42(SB)

	TESTL	$(1<<23), CX	// POPCNT
	SETNE	runtime·support_popcnt(SB)

	TESTL	$(1<<25), CX	// AES
	SETNE	runtime·support_aes(SB)

	TESTL	$(1<<27), CX	// OSXSAVE
	SETNE	runtime·support_osxsave(SB)

	// If OS support for XMM and YMM is not present
	// support_avx will be set back to false later.
	TESTL	$(1<<28), CX	// AVX
	SETNE	runtime·support_avx(SB)

eax7:
	// Load EAX=7/ECX=0 cpuid flags
	CMPL	SI, $7
	JLT	osavx
	MOVL	$7, AX
	MOVL	$0, CX
	CPUID

	TESTL	$(1<<3), BX	// BMI1
	SETNE	runtime·support_bmi1(SB)

	// If OS support for XMM and YMM is not present
	// support_avx2 will be set back to false later.
	TESTL	$(1<<5), BX	// AVX2
	SETNE	runtime·support_avx2(SB)

	TESTL	$(1<<8), BX	// BMI2
	SETNE	runtime·support_bmi2(SB)

	TESTL	$(1<<9), BX	// ERMS
	SETNE	runtime·support_erms(SB)

osavx:
	CMPB	runtime·support_osxsave(SB), $1
	JNE	noavx
	MOVL	$0, CX
	// For XGETBV, OSXSAVE bit is required and sufficient
	XGETBV
	ANDL	$6, AX
	CMPL	AX, $6	// Check for OS support of XMM and YMM registers.
	JE	nocpuinfo
noavx:
	MOVB	$0, runtime·support_avx(SB)
	MOVB	$0, runtime·support_avx2(SB)

nocpuinfo:
	// if there is an _cgo_init, call it.
	MOVQ	_cgo_init(SB), AX
	TESTQ	AX, AX
	JZ	needtls
	// g0 already in DI
	MOVQ	DI, CX	// Win64 uses CX for first parameter
	MOVQ	$setg_gcc<>(SB), SI
	CALL	AX

	// update stackguard after _cgo_init
	MOVQ	$runtime·g0(SB), CX
	MOVQ	(g_stack+stack_lo)(CX), AX
	ADDQ	$const__StackGuard, AX
	MOVQ	AX, g_stackguard0(CX)
	MOVQ	AX, g_stackguard1(CX)

#ifndef GOOS_windows
	JMP ok
#endif
needtls:
#ifdef GOOS_plan9
	// skip TLS setup on Plan 9
	JMP ok
#endif
#ifdef GOOS_solaris
	// skip TLS setup on Solaris
	JMP ok
#endif

	LEAQ	runtime·m0+m_tls(SB), DI
	CALL	runtime·settls(SB)

	// store through it, to make sure it works
	get_tls(BX)
	MOVQ	$0x123, g(BX)
	MOVQ	runtime·m0+m_tls(SB), AX
	CMPQ	AX, $0x123
	JEQ 2(PC)
	MOVL	AX, 0	// abort
ok:
	// set the per-goroutine and per-mach "registers"
	get_tls(BX)
	LEAQ	runtime·g0(SB), CX
	MOVQ	CX, g(BX)
	LEAQ	runtime·m0(SB), AX

	// save m->g0 = g0
	MOVQ	CX, m_g0(AX)
	// save m0 to g0->m
	MOVQ	AX, g_m(CX)

	CLD				// convention is D is always left cleared
	CALL	runtime·check(SB)

	MOVL	16(SP), AX		// copy argc
	MOVL	AX, 0(SP)
	MOVQ	24(SP), AX		// copy argv
	MOVQ	AX, 8(SP)
	CALL	runtime·args(SB)
	CALL	runtime·osinit(SB)
	CALL	runtime·schedinit(SB)

	// create a new goroutine to start program
	MOVQ	$runtime·mainPC(SB), AX		// entry
	PUSHQ	AX
	PUSHQ	$0			// arg size
	CALL	runtime·newproc(SB)
	POPQ	AX
	POPQ	AX

	// start this M
	CALL	runtime·mstart(SB)

	MOVL	$0xf1, 0xf1	// crash; mstart should never return
	RET

// mainPC is a function value for runtime.main, used as the entry point
// of the first goroutine created above.
DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
GLOBL	runtime·mainPC(SB),RODATA,$8

// breakpoint triggers a debugger trap (INT3, encoded as a raw 0xcc byte).
TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
	BYTE	$0xcc
	RET

TEXT runtime·asminit(SB),NOSPLIT,$0-0
	// No per-thread init.
	RET

/*
 *  go-routine
 */

// void gosave(Gobuf*)
// save state in Gobuf; setjmp
TEXT runtime·gosave(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), AX		// gobuf
	LEAQ	buf+0(FP), BX		// caller's SP
	MOVQ	BX, gobuf_sp(AX)
	MOVQ	0(SP), BX		// caller's PC
	MOVQ	BX, gobuf_pc(AX)
	MOVQ	$0, gobuf_ret(AX)
	MOVQ	BP, gobuf_bp(AX)
	// Assert ctxt is zero. See func save.
	MOVQ	gobuf_ctxt(AX), BX
	TESTQ	BX, BX
	JZ	2(PC)
	CALL	runtime·badctxt(SB)
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	BX, gobuf_g(AX)
	RET

// void gogo(Gobuf*)
// restore state from Gobuf; longjmp
TEXT runtime·gogo(SB), NOSPLIT, $16-8
	MOVQ	buf+0(FP), BX		// gobuf

	// If ctxt is not nil, invoke deletion barrier before overwriting.
	MOVQ	gobuf_ctxt(BX), AX
	TESTQ	AX, AX
	JZ	nilctxt
	LEAQ	gobuf_ctxt(BX), AX
	MOVQ	AX, 0(SP)
	MOVQ	$0, 8(SP)
	CALL	runtime·writebarrierptr_prewrite(SB)
	MOVQ	buf+0(FP), BX	// reload; the call above may clobber registers

nilctxt:
	MOVQ	gobuf_g(BX), DX
	MOVQ	0(DX), CX		// make sure g != nil
	get_tls(CX)
	MOVQ	DX, g(CX)
	MOVQ	gobuf_sp(BX), SP	// restore SP
	MOVQ	gobuf_ret(BX), AX
	MOVQ	gobuf_ctxt(BX), DX
	MOVQ	gobuf_bp(BX), BP
	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
	MOVQ	$0, gobuf_ret(BX)
	MOVQ	$0, gobuf_ctxt(BX)
	MOVQ	$0, gobuf_bp(BX)
	MOVQ	gobuf_pc(BX), BX
	JMP	BX

// func mcall(fn func(*g))
// Switch to m->g0's stack, call fn(g).
// Fn must never return. It should gogo(&g->sched)
// to keep running g.
TEXT runtime·mcall(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI

	get_tls(CX)
	MOVQ	g(CX), AX	// save state in g->sched
	MOVQ	0(SP), BX	// caller's PC
	MOVQ	BX, (g_sched+gobuf_pc)(AX)
	LEAQ	fn+0(FP), BX	// caller's SP
	MOVQ	BX, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to m->g0 & its stack, call fn
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	SI, AX	// if g == m->g0 call badmcall
	JNE	3(PC)
	MOVQ	$runtime·badmcall(SB), AX
	JMP	AX
	MOVQ	SI, g(CX)	// g = m->g0
	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	PUSHQ	AX	// push the old g as fn's argument
	MOVQ	DI, DX
	MOVQ	0(DI), DI	// fn is a funcval; code pointer is its first word
	CALL	DI
	POPQ	AX
	// fn must not return; if it does, report the bug.
	MOVQ	$runtime·badmcall2(SB), AX
	JMP	AX
	RET

// systemstack_switch is a dummy routine that systemstack leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the system stack because the one at the top of
// the system stack terminates the stack walk (see topofstack()).
TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
	RET

// func systemstack(fn func())
// Run fn on the system (g0) stack, then switch back to the current g.
TEXT runtime·systemstack(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI	// DI = fn
	get_tls(CX)
	MOVQ	g(CX), AX	// AX = g
	MOVQ	g_m(AX), BX	// BX = m

	MOVQ	m_gsignal(BX), DX	// DX = gsignal
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_g0(BX), DX	// DX = g0
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_curg(BX), R8
	CMPQ	AX, R8
	JEQ	switch

	// Bad: g is not gsignal, not g0, not curg. What is it?
	MOVQ	$runtime·badsystemstack(SB), AX
	CALL	AX

switch:
	// save our state in g->sched. Pretend to
	// be systemstack_switch if the G stack is scanned.
	MOVQ	$runtime·systemstack_switch(SB), SI
	MOVQ	SI, (g_sched+gobuf_pc)(AX)
	MOVQ	SP, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to g0
	MOVQ	DX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(DX), BX
	// make it look like mstart called systemstack on g0, to stop traceback
	SUBQ	$8, BX
	MOVQ	$runtime·mstart(SB), DX
	MOVQ	DX, 0(BX)
	MOVQ	BX, SP

	// call target function
	MOVQ	DI, DX
	MOVQ	0(DI), DI	// code pointer from funcval
	CALL	DI

	// switch back to g
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), BX
	MOVQ	m_curg(BX), AX
	MOVQ	AX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(AX), SP
	MOVQ	$0, (g_sched+gobuf_sp)(AX)
	RET

noswitch:
	// already on m stack, just call directly
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI
	RET

/*
 * support for morestack
 */

// Called during function prolog when more stack is needed.
//
// The traceback routines see morestack on a g0 as being
// the top of a stack (for example, morestack calling newstack
// calling the scheduler calling newm calling gc), so we must
// record an argument size. For that purpose, it has no arguments.
TEXT runtime·morestack(SB),NOSPLIT,$0-0
	// Cannot grow scheduler stack (m->g0).
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	g(CX), SI
	JNE	3(PC)
	CALL	runtime·badmorestackg0(SB)
	INT	$3

	// Cannot grow signal stack (m->gsignal).
	MOVQ	m_gsignal(BX), SI
	CMPQ	g(CX), SI
	JNE	3(PC)
	CALL	runtime·badmorestackgsignal(SB)
	INT	$3

	// Called from f.
	// Set m->morebuf to f's caller.
	MOVQ	8(SP), AX	// f's caller's PC
	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
	LEAQ	16(SP), AX	// f's caller's SP
	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SI, (m_morebuf+gobuf_g)(BX)

	// Set g->sched to context in f.
	MOVQ	0(SP), AX	// f's PC
	MOVQ	AX, (g_sched+gobuf_pc)(SI)
	MOVQ	SI, (g_sched+gobuf_g)(SI)
	LEAQ	8(SP), AX	// f's SP
	MOVQ	AX, (g_sched+gobuf_sp)(SI)
	MOVQ	BP, (g_sched+gobuf_bp)(SI)
	// newstack will fill gobuf.ctxt.

	// Call newstack on m->g0's stack.
	MOVQ	m_g0(BX), BX
	MOVQ	BX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(BX), SP
	PUSHQ	DX	// ctxt argument
	CALL	runtime·newstack(SB)
	MOVQ	$0, 0x1003	// crash if newstack returns
	POPQ	DX	// keep balance check happy
	RET

// morestack but not preserving ctxt.
TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
	MOVL	$0, DX
	JMP	runtime·morestack(SB)

// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
// of constant-sized-frame functions to encode a few bits of size in the pc.
// Caution: ugly multiline assembly macros in your future!

#define DISPATCH(NAME,MAXSIZE)		\
	CMPQ	CX, $MAXSIZE;		\
	JA	3(PC);			\
	MOVQ	$NAME(SB), AX;		\
	JMP	AX
// Note: can't just "JMP NAME(SB)" - bad inlining results.
TEXT reflect·call(SB), NOSPLIT, $0-0
	JMP	·reflectcall(SB)

// reflectcall dispatches to the smallest fixed-frame call* routine whose
// frame can hold argsize bytes of arguments.
TEXT ·reflectcall(SB), NOSPLIT, $0-32
	MOVLQZX argsize+24(FP), CX
	DISPATCH(runtime·call32, 32)
	DISPATCH(runtime·call64, 64)
	DISPATCH(runtime·call128, 128)
	DISPATCH(runtime·call256, 256)
	DISPATCH(runtime·call512, 512)
	DISPATCH(runtime·call1024, 1024)
	DISPATCH(runtime·call2048, 2048)
	DISPATCH(runtime·call4096, 4096)
	DISPATCH(runtime·call8192, 8192)
	DISPATCH(runtime·call16384, 16384)
	DISPATCH(runtime·call32768, 32768)
	DISPATCH(runtime·call65536, 65536)
	DISPATCH(runtime·call131072, 131072)
	DISPATCH(runtime·call262144, 262144)
	DISPATCH(runtime·call524288, 524288)
	DISPATCH(runtime·call1048576, 1048576)
	DISPATCH(runtime·call2097152, 2097152)
	DISPATCH(runtime·call4194304, 4194304)
	DISPATCH(runtime·call8388608, 8388608)
	DISPATCH(runtime·call16777216, 16777216)
	DISPATCH(runtime·call33554432, 33554432)
	DISPATCH(runtime·call67108864, 67108864)
	DISPATCH(runtime·call134217728, 134217728)
	DISPATCH(runtime·call268435456, 268435456)
	DISPATCH(runtime·call536870912, 536870912)
	DISPATCH(runtime·call1073741824, 1073741824)
	MOVQ	$runtime·badreflectcall(SB), AX
	JMP	AX

// CALLFN defines one fixed-frame call* routine: copy the caller-provided
// arguments onto this frame, call f, then copy results back via callRet.
#define CALLFN(NAME,MAXSIZE)			\
TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
	NO_LOCAL_POINTERS;			\
	/* copy arguments to stack */		\
	MOVQ	argptr+16(FP), SI;		\
	MOVLQZX argsize+24(FP), CX;		\
	MOVQ	SP, DI;				\
	REP;MOVSB;				\
	/* call function */			\
	MOVQ	f+8(FP), DX;			\
	PCDATA  $PCDATA_StackMapIndex, $0;	\
	CALL	(DX);				\
	/* copy return values back */		\
	MOVQ	argtype+0(FP), DX;		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX	argsize+24(FP), CX;		\
	MOVLQZX	retoffset+28(FP), BX;		\
	MOVQ	SP, SI;				\
	ADDQ	BX, DI;				\
	ADDQ	BX, SI;				\
	SUBQ	BX, CX;				\
	CALL	callRet<>(SB);			\
	RET

// callRet copies return values back at the end of call*. This is a
// separate function so it can allocate stack space for the arguments
// to reflectcallmove. It does not follow the Go ABI; it expects its
// arguments in registers (DX = argtype, DI = dst, SI = src, CX = size).
TEXT callRet<>(SB), NOSPLIT, $32-0
	NO_LOCAL_POINTERS
	MOVQ	DX, 0(SP)
	MOVQ	DI, 8(SP)
	MOVQ	SI, 16(SP)
	MOVQ	CX, 24(SP)
	CALL	runtime·reflectcallmove(SB)
	RET

CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
CALLFN(·call256, 256)
CALLFN(·call512, 512)
CALLFN(·call1024, 1024)
CALLFN(·call2048, 2048)
CALLFN(·call4096, 4096)
CALLFN(·call8192, 8192)
CALLFN(·call16384, 16384)
CALLFN(·call32768, 32768)
CALLFN(·call65536, 65536)
CALLFN(·call131072, 131072)
CALLFN(·call262144, 262144)
CALLFN(·call524288, 524288)
CALLFN(·call1048576, 1048576)
CALLFN(·call2097152, 2097152)
CALLFN(·call4194304, 4194304)
CALLFN(·call8388608, 8388608)
CALLFN(·call16777216, 16777216)
CALLFN(·call33554432, 33554432)
CALLFN(·call67108864, 67108864)
CALLFN(·call134217728, 134217728)
CALLFN(·call268435456, 268435456)
CALLFN(·call536870912, 536870912)
CALLFN(·call1073741824, 1073741824)

// procyield spins for the requested number of PAUSE iterations,
// hinting to the CPU that this is a spin-wait loop.
TEXT runtime·procyield(SB),NOSPLIT,$0-0
	MOVL	cycles+0(FP), AX
again:
	PAUSE
	SUBL	$1, AX
	JNZ	again
	RET

TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
	// Stores are already ordered on x86, so this is just a
	// compile barrier.
	RET

// void jmpdefer(fn, sp);
// called from deferreturn.
// 1. pop the caller
// 2. sub 5 bytes from the callers return
// 3.
// jmp to the argument
TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
	MOVQ	fv+0(FP), DX	// fn
	MOVQ	argp+8(FP), BX	// caller sp
	LEAQ	-8(BX), SP	// caller sp after CALL
	MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
	SUBQ	$5, (SP)	// return to CALL again (5 = length of the CALL instruction)
	MOVQ	0(DX), BX
	JMP	BX	// but first run the deferred function

// Save state of caller into g->sched. Smashes R8, R9.
TEXT gosave<>(SB),NOSPLIT,$0
	get_tls(R8)
	MOVQ	g(R8), R8
	MOVQ	0(SP), R9	// caller's PC
	MOVQ	R9, (g_sched+gobuf_pc)(R8)
	LEAQ	8(SP), R9	// caller's SP
	MOVQ	R9, (g_sched+gobuf_sp)(R8)
	MOVQ	$0, (g_sched+gobuf_ret)(R8)
	MOVQ	BP, (g_sched+gobuf_bp)(R8)
	// Assert ctxt is zero. See func save.
	MOVQ	(g_sched+gobuf_ctxt)(R8), R9
	TESTQ	R9, R9
	JZ	2(PC)
	CALL	runtime·badctxt(SB)
	RET

// func asmcgocall(fn, arg unsafe.Pointer) int32
// Call fn(arg) on the scheduler stack,
// aligned appropriately for the gcc ABI.
// See cgocall.go for more details.
TEXT ·asmcgocall(SB),NOSPLIT,$0-20
	MOVQ	fn+0(FP), AX
	MOVQ	arg+8(FP), BX

	MOVQ	SP, DX	// remember the g-stack SP so the depth can be saved below

	// Figure out if we need to switch to m->g0 stack.
	// We get called to create new OS threads too, and those
	// come in on the m->g0 stack already.
	get_tls(CX)
	MOVQ	g(CX), R8
	CMPQ	R8, $0
	JEQ	nosave
	MOVQ	g_m(R8), R8
	MOVQ	m_g0(R8), SI
	MOVQ	g(CX), DI
	CMPQ	SI, DI
	JEQ	nosave
	MOVQ	m_gsignal(R8), SI
	CMPQ	SI, DI
	JEQ	nosave

	// Switch to system stack.
	MOVQ	m_g0(R8), SI
	CALL	gosave<>(SB)
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP

	// Now on a scheduling stack (a pthread-created stack).
	// Make sure we have enough room for 4 stack-backed fast-call
	// registers as per windows amd64 calling convention.
	SUBQ	$64, SP
	ANDQ	$~15, SP	// alignment for gcc ABI
	MOVQ	DI, 48(SP)	// save g
	MOVQ	(g_stack+stack_hi)(DI), DI
	SUBQ	DX, DI
	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
	MOVQ	BX, CX		// CX = first argument in Win64
	CALL	AX

	// Restore registers, g, stack pointer.
	get_tls(CX)
	MOVQ	48(SP), DI
	MOVQ	(g_stack+stack_hi)(DI), SI
	SUBQ	40(SP), SI	// recompute SP from saved depth
	MOVQ	DI, g(CX)
	MOVQ	SI, SP

	MOVL	AX, ret+16(FP)
	RET

nosave:
	// Running on a system stack, perhaps even without a g.
	// Having no g can happen during thread creation or thread teardown
	// (see needm/dropm on Solaris, for example).
	// This code is like the above sequence but without saving/restoring g
	// and without worrying about the stack moving out from under us
	// (because we're on a system stack, not a goroutine stack).
	// The above code could be used directly if already on a system stack,
	// but then the only path through this code would be a rare case on Solaris.
	// Using this code for all "already on system stack" calls exercises it more,
	// which should help keep it correct.
	SUBQ	$64, SP
	ANDQ	$~15, SP
	MOVQ	$0, 48(SP)	// where above code stores g, in case someone looks during debugging
	MOVQ	DX, 40(SP)	// save original stack pointer
	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
	MOVQ	BX, CX		// CX = first argument in Win64
	CALL	AX
	MOVQ	40(SP), SI	// restore original stack pointer
	MOVQ	SI, SP
	MOVL	AX, ret+16(FP)
	RET

// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
// Turn the fn into a Go func (by taking its address) and call
// cgocallback_gofunc.
TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
	LEAQ	fn+0(FP), AX	// address of fn serves as a FuncVal*
	MOVQ	AX, 0(SP)
	MOVQ	frame+8(FP), AX
	MOVQ	AX, 8(SP)
	MOVQ	framesize+16(FP), AX
	MOVQ	AX, 16(SP)
	MOVQ	ctxt+24(FP), AX
	MOVQ	AX, 24(SP)
	MOVQ	$runtime·cgocallback_gofunc(SB), AX
	CALL	AX
	RET

// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
// See cgocall.go for more details.
TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
	NO_LOCAL_POINTERS

	// If g is nil, Go did not create the current thread.
	// Call needm to obtain one m for temporary use.
	// In this case, we're running on the thread stack, so there's
	// lots of space, but the linker doesn't know. Hide the call from
	// the linker analysis by using an indirect call through AX.
	get_tls(CX)
#ifdef GOOS_windows
	MOVL	$0, BX
	CMPQ	CX, $0
	JEQ	2(PC)
#endif
	MOVQ	g(CX), BX
	CMPQ	BX, $0
	JEQ	needm
	MOVQ	g_m(BX), BX
	MOVQ	BX, R8	// holds oldm until end of function
	JMP	havem
needm:
	MOVQ	$0, 0(SP)	// oldm is nil on this path
	MOVQ	$runtime·needm(SB), AX
	CALL	AX
	MOVQ	0(SP), R8
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX

	// Set m->sched.sp = SP, so that if a panic happens
	// during the function we are about to execute, it will
	// have a valid SP to run on the g0 stack.
	// The next few lines (after the havem label)
	// will save this SP onto the stack and then write
	// the same SP back to m->sched.sp. That seems redundant,
	// but if an unrecovered panic happens, unwindm will
	// restore the g->sched.sp from the stack location
	// and then systemstack will try to use it. If we don't set it here,
	// that restored SP will be uninitialized (typically 0) and
	// will not be usable.
	MOVQ	m_g0(BX), SI
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

havem:
	// Now there's a valid m, and we're running on its m->g0.
	// Save current m->g0->sched.sp on stack and then set it to SP.
	// Save current sp in m->g0->sched.sp in preparation for
	// switch back to m->curg stack.
	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
	MOVQ	m_g0(BX), SI
	MOVQ	(g_sched+gobuf_sp)(SI), AX
	MOVQ	AX, 0(SP)
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

	// Switch to m->curg stack and call runtime.cgocallbackg.
	// Because we are taking over the execution of m->curg
	// but *not* resuming what had been running, we need to
	// save that information (m->curg->sched) so we can restore it.
	// We can restore m->curg->sched.sp easily, because calling
	// runtime.cgocallbackg leaves SP unchanged upon return.
	// To save m->curg->sched.pc, we push it onto the stack.
	// This has the added benefit that it looks to the traceback
	// routine like cgocallbackg is going to return to that
	// PC (because the frame we allocate below has the same
	// size as cgocallback_gofunc's frame declared above)
	// so that the traceback will seamlessly trace back into
	// the earlier calls.
	//
	// In the new goroutine, 8(SP) holds the saved R8.
	MOVQ	m_curg(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), DI	// prepare stack as DI
	MOVQ	(g_sched+gobuf_pc)(SI), BX
	MOVQ	BX, -8(DI)
	// Compute the size of the frame, including return PC and, if
	// GOEXPERIMENT=framepointer, the saved base pointer
	MOVQ	ctxt+24(FP), BX
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX
	SUBQ	AX, DI
	MOVQ	DI, SP

	MOVQ	R8, 8(SP)
	MOVQ	BX, 0(SP)
	CALL	runtime·cgocallbackg(SB)
	MOVQ	8(SP), R8

	// Compute the size of the frame again. FP and SP have
	// completely different values here than they did above,
	// but only their difference matters.
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX

	// Restore g->sched (== m->curg->sched) from saved values.
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SP, DI
	ADDQ	AX, DI
	MOVQ	-8(DI), BX
	MOVQ	BX, (g_sched+gobuf_pc)(SI)
	MOVQ	DI, (g_sched+gobuf_sp)(SI)

	// Switch back to m->g0's stack and restore m->g0->sched.sp.
	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
	// so we do not have to restore it.)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP
	MOVQ	0(SP), AX
	MOVQ	AX, (g_sched+gobuf_sp)(SI)

	// If the m on entry was nil, we called needm above to borrow an m
	// for the duration of the call. Since the call is over, return it with dropm.
	CMPQ	R8, $0
	JNE 3(PC)
	MOVQ	$runtime·dropm(SB), AX
	CALL	AX

	// Done!
	RET

// void setg(G*); set g. for use by needm.
TEXT runtime·setg(SB), NOSPLIT, $0-8
	MOVQ	gg+0(FP), BX
#ifdef GOOS_windows
	CMPQ	BX, $0
	JNE	settls
	MOVQ	$0, 0x28(GS)	// 0x28(GS) is the TLS g slot on Windows
	RET
settls:
	MOVQ	g_m(BX), AX
	LEAQ	m_tls(AX), AX
	MOVQ	AX, 0x28(GS)
#endif
	get_tls(CX)
	MOVQ	BX, g(CX)
	RET

// void setg_gcc(G*); set g called from gcc.
TEXT setg_gcc<>(SB),NOSPLIT,$0
	get_tls(AX)
	MOVQ	DI, g(AX)	// g pointer arrives in DI (AMD64 C ABI first arg)
	RET

// check that SP is in range [g->stack.lo, g->stack.hi)
TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
	get_tls(CX)
	MOVQ	g(CX), AX
	CMPQ	(g_stack+stack_hi)(AX), SP
	JHI	2(PC)
	INT	$3
	CMPQ	SP, (g_stack+stack_lo)(AX)
	JHI	2(PC)
	INT	$3
	RET

// getcallerpc returns the PC of the call instruction that invoked the
// caller, read from the word just below argp on the stack.
TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
	MOVQ	argp+0(FP),AX		// addr of first arg
	MOVQ	-8(AX),AX		// get calling pc
	MOVQ	AX, ret+8(FP)
	RET

// func cputicks() int64
// Serialized RDTSC: LFENCE on Intel (see lfenceBeforeRdtsc set in rt0_go),
// MFENCE otherwise.
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
	JNE	mfence
	LFENCE
	JMP	done
mfence:
	MFENCE
done:
	RDTSC
	SHLQ	$32, DX	// combine EDX:EAX into one 64-bit value
	ADDQ	DX, AX
	MOVQ	AX, ret+0(FP)
	RET

// memhash_varlen(p unsafe.Pointer, h seed) uintptr
// redirects to memhash(p, h, size) using the size
// stored in the closure.
TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
	GO_ARGS
	NO_LOCAL_POINTERS
	MOVQ	p+0(FP), AX
	MOVQ	h+8(FP), BX
	MOVQ	8(DX), CX	// size is the second word of the closure (DX = closure context)
	MOVQ	AX, 0(SP)
	MOVQ	BX, 8(SP)
	MOVQ	CX, 16(SP)
	CALL	runtime·memhash(SB)
	MOVQ	24(SP), AX
	MOVQ	AX, ret+16(FP)
	RET

// hash function using AES hardware instructions
TEXT runtime·aeshash(SB),NOSPLIT,$0-32
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	s+16(FP), CX	// size
	LEAQ	ret+24(FP), DX
	JMP	runtime·aeshashbody(SB)

TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to string struct
	MOVQ	8(AX), CX	// length of string
	MOVQ	(AX), AX	// string data
	LEAQ	ret+16(FP), DX
	JMP	runtime·aeshashbody(SB)

// AX: data
// CX: length
// DX: address to put return value
TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
	// Fill an SSE register with our seeds.
	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
	PINSRW	$4, CX, X0			// 16 bits of length
	PSHUFHW $0, X0, X0			// repeat length 4 times total
	MOVO	X0, X1				// save unscrambled seed
	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
	AESENC	X0, X0				// scramble seed

	// Dispatch on input length.
	CMPQ	CX, $16
	JB	aes0to15
	JE	aes16
	CMPQ	CX, $32
	JBE	aes17to32
	CMPQ	CX, $64
	JBE	aes33to64
	CMPQ	CX, $128
	JBE	aes65to128
	JMP	aes129plus

aes0to15:
	TESTQ	CX, CX
	JE	aes0

	ADDQ	$16, AX
	TESTW	$0xff0, AX
	JE	endofpage

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X1
	ADDQ	CX, CX
	MOVQ	$masks<>(SB), AX
	PAND	(AX)(CX*8), X1	// mask off the CX valid bytes
final1:
	PXOR	X0, X1	// xor data with seed
	AESENC	X1, X1	// scramble combo 3 times
	AESENC	X1, X1
	AESENC	X1, X1
	MOVQ	X1, (DX)
	RET

endofpage:
	// address ends in 1111xxxx. Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	MOVOU	-32(AX)(CX*1), X1
	ADDQ	CX, CX
	MOVQ	$shifts<>(SB), AX
	PSHUFB	(AX)(CX*8), X1
	JMP	final1

aes0:
	// Return scrambled input seed
	AESENC	X0, X0
	MOVQ	X0, (DX)
	RET

aes16:
	MOVOU	(AX), X1
	JMP	final1

aes17to32:
	// make second starting seed
	PXOR	runtime·aeskeysched+16(SB), X1
	AESENC	X1, X1

	// load data to be hashed
	MOVOU	(AX), X2
	MOVOU	-16(AX)(CX*1), X3

	// xor with seed
	PXOR	X0, X2
	PXOR	X1, X3

	// scramble 3 times
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3

	// combine results
	PXOR	X3, X2
	MOVQ	X2, (DX)
	RET

aes33to64:
	// make 3 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3

	MOVOU	(AX), X4
	MOVOU	16(AX), X5
	MOVOU	-32(AX)(CX*1), X6
	MOVOU	-16(AX)(CX*1), X7

	PXOR	X0, X4
	PXOR	X1, X5
	PXOR	X2, X6
	PXOR	X3, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	PXOR	X6, X4
	PXOR	X7, X5
	PXOR	X5, X4
	MOVQ	X4, (DX)
	RET

aes65to128:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// load data
	MOVOU	(AX), X8
	MOVOU	16(AX), X9
	MOVOU	32(AX), X10
	MOVOU	48(AX), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor with seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// scramble 3 times
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

aes129plus:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// start with last (possibly overlapping) block
	MOVOU	-128(AX)(CX*1), X8
	MOVOU	-112(AX)(CX*1), X9
	MOVOU	-96(AX)(CX*1), X10
	MOVOU	-80(AX)(CX*1), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor in seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// compute number of remaining 128-byte blocks
	DECQ	CX
	SHRQ	$7, CX

aesloop:
	// scramble state
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// scramble state, xor in a block
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	MOVOU	64(AX), X4
	MOVOU	80(AX), X5
	MOVOU	96(AX), X6
	MOVOU	112(AX), X7
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	ADDQ	$128, AX
	DECQ	CX
	JNE	aesloop

	// 3 more scrambles to finish
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

// aeshash32 hashes a 4-byte value pointed to by p with seed h.
TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

// aeshash64 hashes an 8-byte value pointed to by p with seed h.
TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data (8 bytes inserted into qword lane 1)
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

// simple mask to get rid of data in the high part of the register.
// Entry i (i = 0..15, at offset i*16) keeps the low i bytes of a 16-byte register.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256

// checkASM reports (in ret+0(FP)) whether the assembly data tables
// satisfy their alignment assumptions.
TEXT runtime·checkASM(SB),NOSPLIT,$0-1
	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
	MOVQ	$masks<>(SB), AX
	MOVQ	$shifts<>(SB), BX
	ORQ	BX, AX
	TESTQ	$15, AX
	SETEQ	ret+0(FP)
	RET

// these are arguments to pshufb. They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
DATA shifts<>+0x00(SB)/8, $0x0000000000000000
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256

// memequal(p, q unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal(SB),NOSPLIT,$0-25
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq	// identical pointers compare equal without reading memory
	MOVQ	size+16(FP), BX
	LEAQ	ret+24(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+24(FP)
	RET

// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	8(DX), BX	// compiler stores size at offset 8 in the closure
	LEAQ	ret+16(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+16(FP)
	RET

// eqstring tests whether two strings are equal.
// The compiler guarantees that strings passed
// to eqstring have equal length.
// See runtime_test.go:eqstring_generic for
// equivalent Go code.
TEXT runtime·eqstring(SB),NOSPLIT,$0-33
	MOVQ	s1_base+0(FP), SI
	MOVQ	s2_base+16(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	s1_len+8(FP), BX
	LEAQ	ret+32(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+32(FP)
	RET

// memeqbody compares BX bytes at SI and DI; writes 1/0 to the byte at AX.
// a in SI
// b in DI
// count in BX
// address of result byte in AX
TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small
	CMPQ	BX, $64
	JB	bigloop
	CMPB	runtime·support_avx2(SB), $1
	JE	hugeloop_avx2

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff	// all 16 mask bits set => all 64 bytes equal
	JEQ	hugeloop
	MOVB	$0, (AX)	// fell out of hugeloop with a mismatch => not equal
	RET

	// 64 bytes at a time using ymm registers
hugeloop_avx2:
	CMPQ	BX, $64
	JB	bigloop_avx2
	VMOVDQU	(SI), Y0
	VMOVDQU	(DI), Y1
	VMOVDQU	32(SI), Y2
	VMOVDQU	32(DI), Y3
	VPCMPEQB	Y1, Y0, Y4
	VPCMPEQB	Y2, Y3, Y5
	VPAND	Y4, Y5, Y6
	VPMOVMSKB	Y6, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffffffff	// all 32 mask bits set => all 64 bytes equal
	JEQ	hugeloop_avx2
	VZEROUPPER	// leave AVX mode before returning to SSE/scalar code
	MOVB	$0, (AX)
	RET

bigloop_avx2:
	VZEROUPPER

	// 8 bytes at a time using 64-bit register
bigloop:
	CMPQ	BX, $8
	JBE	leftover
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	bigloop
	MOVB	$0, (AX)
	RET

	// remaining 0-8 bytes
leftover:
	// Re-read the final 8 bytes; this may overlap bytes already compared.
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	(AX)
	RET

small:
	// fewer than 8 bytes
	CMPQ	BX, $0
	JEQ	equal	// ZF is set here, so SETEQ at equal: stores 1

	LEAQ	0(BX*8), CX	// bytes left -> bits left
	NEGQ	CX	// - bits left (== 64 - bits left mod 64)

	CMPB	SI, $0xf8
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
di_finish:

	// Compare only the BX low bytes: difference shifted out the top is discarded.
	SUBQ	SI, DI
	SHLQ	CX, DI
equal:
	SETEQ	(AX)
	RET

TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
	MOVQ	s1_base+0(FP), SI
	MOVQ	s1_len+8(FP), BX
	MOVQ	s2_base+16(FP), DI
	MOVQ	s2_len+24(FP), DX
	LEAQ	ret+32(FP), R9
	JMP	runtime·cmpbody(SB)

TEXT bytes·Compare(SB),NOSPLIT,$0-56
	MOVQ	s1+0(FP), SI
	MOVQ	s1+8(FP), BX
	MOVQ	s2+24(FP), DI
	MOVQ	s2+32(FP), DX
	LEAQ	res+48(FP), R9
	JMP	runtime·cmpbody(SB)

// cmpbody lexicographically compares the byte sequences a and b.
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame	// same base pointer => compare lengths only
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8	// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	CMPB	runtime·support_avx2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// diff64/diff48/diff32 rebase SI/DI onto the 16-byte chunk
	// (found by big_loop) that contains the difference, then fall to diff16.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX	// 1 if a's byte > b's byte (unsigned)
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// compare the (possibly overlapping) final 8 bytes
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX	// - bits left (== 64 - bits left mod 64)
	JEQ	allsame	// zero bytes to compare

	// load bytes of a into high bytes of AX
	CMPB	SI, $0xf8
	JA	si_high	// address ends in 11111xxx: 8-byte load would cross a page
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI	// discard bytes beyond the R8 we care about

	// load bytes of b into high bytes of DI (same technique)
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

allsame:
	// common prefix is identical; result is decided by the lengths
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET

	// this works for >= 64 bytes of data.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB	Y2, Y3, Y0
	VPMOVMSKB	Y0, AX
	XORL	$0xffffffff, AX
	JNE	diff32_avx2
	VPCMPEQB	Y4, Y5, Y6
	VPMOVMSKB	Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP	diff16

	// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP	diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP	loop

TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
	MOVQ	s+0(FP), DI
	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
	MOVQ	s_len+8(FP), DX
	MOVQ	c+16(FP), BP
	MOVQ	c_len+24(FP), AX
	MOVQ	DI, R10
	LEAQ	ret+32(FP), R11
	JMP	runtime·indexShortStr(SB)

TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
	MOVQ	s+0(FP), DI
	MOVQ	s_len+8(FP), DX
	MOVQ	c+24(FP), BP
	MOVQ	c_len+32(FP), AX
	MOVQ	DI, R10
	LEAQ	ret+48(FP), R11
	JMP	runtime·indexShortStr(SB)

// indexShortStr finds the first occurrence of a short separator in a string,
// dispatching on the separator length to a specialized comparison loop.
// AX: length of string, that we are searching for
// DX: length of string, in which we are searching
// DI: pointer to string, in which we are searching
// BP: pointer to string, that we are searching for
// R10: copy of DI (start of haystack), used to compute the returned index
// R11: address, where to put return value
TEXT runtime·indexShortStr(SB),NOSPLIT,$0
	CMPQ	AX, DX
	JA	fail	// separator longer than haystack
	CMPQ	DX, $16
	JAE	sse42
no_sse42:
	CMPQ	AX, $2
	JA	_3_or_more
	// len(sep) == 2: compare as one 16-bit word
	MOVW	(BP), BP
	LEAQ	-1(DI)(DX*1), DX	// DX = one past the last valid start position
loop2:
	MOVW	(DI), SI
	CMPW	SI, BP
	JZ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop2
	JMP	fail
_3_or_more:
	CMPQ	AX, $3
	JA	_4_or_more
	// len(sep) == 3: 2-byte prefix in BP, bytes 1..2 in BX
	MOVW	1(BP), BX
	MOVW	(BP), BP
	LEAQ	-2(DI)(DX*1), DX
loop3:
	MOVW	(DI), SI
	CMPW	SI, BP
	JZ	partial_success3
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop3
	JMP	fail
partial_success3:
	MOVW	1(DI), SI
	CMPW	SI, BX
	JZ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop3
	JMP	fail
_4_or_more:
	CMPQ	AX, $4
	JA	_5_or_more
	// len(sep) == 4: compare as one 32-bit word
	MOVL	(BP), BP
	LEAQ	-3(DI)(DX*1), DX
loop4:
	MOVL	(DI), SI
	CMPL	SI, BP
	JZ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop4
	JMP	fail
_5_or_more:
	CMPQ	AX, $7
	JA	_8_or_more
	// len(sep) in 5..7: compare first 4 bytes and (overlapping) last 4 bytes
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX
	MOVL	-4(BP)(AX*1), BX	// last 4 bytes of sep
	MOVL	(BP), BP	// first 4 bytes of sep
loop5to7:
	MOVL	(DI), SI
	CMPL	SI, BP
	JZ	partial_success5to7
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop5to7
	JMP	fail
partial_success5to7:
	MOVL	-4(AX)(DI*1), SI
	CMPL	SI, BX
	JZ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop5to7
	JMP	fail
_8_or_more:
	CMPQ	AX, $8
	JA	_9_or_more
	// len(sep) == 8: compare as one 64-bit word
	MOVQ	(BP), BP
	LEAQ	-7(DI)(DX*1), DX
loop8:
	MOVQ	(DI), SI
	CMPQ	SI, BP
	JZ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop8
	JMP	fail
_9_or_more:
	CMPQ	AX, $15
	JA	_16_or_more
	// len(sep) in 9..15: first 8 bytes and (overlapping) last 8 bytes
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX
	MOVQ	-8(BP)(AX*1), BX
	MOVQ	(BP), BP
loop9to15:
	MOVQ	(DI), SI
	CMPQ	SI, BP
	JZ	partial_success9to15
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop9to15
	JMP	fail
partial_success9to15:
	MOVQ	-8(AX)(DI*1), SI
	CMPQ	SI, BX
	JZ	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop9to15
	JMP	fail
_16_or_more:
	CMPQ	AX, $16
	JA	_17_or_more
	// len(sep) == 16: compare as one 128-bit xmm register
	MOVOU	(BP), X1
	LEAQ	-15(DI)(DX*1), DX
loop16:
	MOVOU	(DI), X2
	PCMPEQB	X1, X2
	PMOVMSKB	X2, SI
	CMPQ	SI, $0xffff	// all 16 bytes equal
	JE	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop16
	JMP	fail
_17_or_more:
	CMPQ	AX, $31
	JA	_32_or_more
	// len(sep) in 17..31: first 16 bytes and (overlapping) last 16 bytes
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX
	MOVOU	-16(BP)(AX*1), X0
	MOVOU	(BP), X1
loop17to31:
	MOVOU	(DI), X2
	PCMPEQB	X1, X2
	PMOVMSKB	X2, SI
	CMPQ	SI, $0xffff
	JE	partial_success17to31
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop17to31
	JMP	fail
partial_success17to31:
	MOVOU	-16(AX)(DI*1), X3
	PCMPEQB	X0, X3
	PMOVMSKB	X3, SI
	CMPQ	SI, $0xffff
	JE	success
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop17to31
	JMP	fail
// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
// So no need to check cpuid
_32_or_more:
	CMPQ	AX, $32
	JA	_33_to_63
	// len(sep) == 32: compare as one 256-bit ymm register
	VMOVDQU	(BP), Y1
	LEAQ	-31(DI)(DX*1), DX
loop32:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, SI
	CMPL	SI, $0xffffffff	// all 32 bytes equal
	JE	success_avx2
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop32
	JMP	fail_avx2
_33_to_63:
	// len(sep) in 33..63: first 32 bytes and (overlapping) last 32 bytes
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX
	VMOVDQU	-32(BP)(AX*1), Y0
	VMOVDQU	(BP), Y1
loop33to63:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, SI
	CMPL	SI, $0xffffffff
	JE	partial_success33to63
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop33to63
	JMP	fail_avx2
partial_success33to63:
	VMOVDQU	-32(AX)(DI*1), Y3
	VPCMPEQB	Y0, Y3, Y4
	VPMOVMSKB	Y4, SI
	CMPL	SI, $0xffffffff
	JE	success_avx2
	ADDQ	$1, DI
	CMPQ	DI, DX
	JB	loop33to63
fail_avx2:
	VZEROUPPER	// leave AVX mode before returning
fail:
	MOVQ	$-1, (R11)
	RET
success_avx2:
	VZEROUPPER
	JMP	success
sse42:
	CMPB	runtime·support_sse42(SB), $1
	JNE	no_sse42
	CMPQ	AX, $12
	// PCMPESTRI is slower than normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
	JAE	_9_or_more
	LEAQ	16(BP), SI
	TESTW	$0xff0, SI	// would a 16-byte load of sep cross a page boundary?
	JEQ	no_sse42
	MOVOU	(BP), X1
	LEAQ	-15(DI)(DX*1), SI
	MOVQ	$16, R9
	SUBQ	AX, R9	// We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
	// for equality (bits 2,3 are 11)
	// result is not masked or inverted (bits 4,5 are 00)
	// and corresponds to first matching byte (bit 6 is 0)
	PCMPESTRI	$0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means partial match at the end of the string,
	// otherwise sep is at offset CX from X1 start
	CMPQ	CX, R9
	JBE	sse42_success
	ADDQ	R9, DI
	CMPQ	DI, SI
	JB	loop_sse42
	PCMPESTRI	$0x0c, -1(SI), X1	// final (possibly overlapping) window
	CMPQ	CX, R9
	JA	fail
	LEAQ	-1(SI), DI
sse42_success:
	ADDQ	CX, DI
success:
	SUBQ	R10, DI	// convert pointer to index from start of haystack
	MOVQ	DI, (R11)
	RET

TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
	MOVQ	s+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+24(FP), AL
	LEAQ	ret+32(FP), R8
	JMP	runtime·indexbytebody(SB)

TEXT strings·IndexByte(SB),NOSPLIT,$0-32
	MOVQ	s+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+16(FP), AL
	LEAQ	ret+24(FP), R8
	JMP	runtime·indexbytebody(SB)

// indexbytebody stores the index of the first occurrence of AL in the data,
// or -1 if absent.
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL	$0, X0, X0

	CMPQ	BX, $16
	JLT	small

	MOVQ	SI, DI

	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Find first set bit, if any.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ	$-1, (R8)
	RET

	// We've found a chunk containing the byte.
	// The chunk was loaded from DI.
	// The index of the matching byte in the chunk is DX.
	// The start of the data is SI.
ssesuccess:
	SUBQ	SI, DI	// Compute offset of chunk within data.
	ADDQ	DX, DI	// Add offset of byte within chunk.
	MOVQ	DI, (R8)
	RET

	// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure

	// Check if we'll load across a page boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1	// Load data
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	CMPL	DX, BX
	JAE	failure	// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX	// Shift desired bits down to bottom of register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	CMPB	runtime·support_avx2(SB), $1
	JNE	sse
	MOVD	AX, X0
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1	// replicate the target byte into all 32 lanes
avx2_loop:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3
	JNZ	avx2success
	ADDQ	$32, DI
	CMPQ	DI, R11
	JLT	avx2_loop
	// Search the final (possibly overlapping) 32-byte chunk.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3
	JNZ	avx2success
	VZEROUPPER
	MOVQ	$-1, (R8)
	RET

avx2success:
	VPMOVMSKB	Y3, DX
	BSFL	DX, DX	// index of match within the chunk
	SUBQ	SI, DI
	ADDQ	DI, DX
	MOVQ	DX, (R8)
	VZEROUPPER
	RET

TEXT bytes·Equal(SB),NOSPLIT,$0-49
	MOVQ	a_len+8(FP), BX
	MOVQ	b_len+32(FP), CX
	CMPQ	BX, CX
	JNE	eqret	// different lengths => not equal
	MOVQ	a+0(FP), SI
	MOVQ	b+24(FP), DI
	LEAQ	ret+48(FP), AX
	JMP	runtime·memeqbody(SB)
eqret:
	MOVB	$0, ret+48(FP)
	RET

TEXT bytes·countByte(SB),NOSPLIT,$0-40
	MOVQ	s+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+24(FP), AL
	LEAQ	ret+32(FP), R8
	JMP	runtime·countByte(SB)

TEXT strings·countByte(SB),NOSPLIT,$0-32
	MOVQ	s+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+16(FP), AL
	LEAQ	ret+24(FP), R8
	JMP	runtime·countByte(SB)

// countByte stores the number of occurrences of AL in the data.
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// This requires the POPCNT instruction
TEXT runtime·countByte(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL	$0, X0, X0

	CMPQ	BX, $16
	JLT	small

	MOVQ	$0, R12	// Accumulator

	MOVQ	SI, DI

	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Count number of matching bytes
	POPCNTL	DX, DX
	// Accumulate into R12
	ADDQ	DX, R12
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JBE	sseloop

	// Get the number of bytes to consider in the last 16 bytes
	ANDQ	$15, BX
	JZ	end	// length was a multiple of 16; nothing left over

	// Create mask to ignore overlap between previous 16 byte block
	// and the next.
	MOVQ	$16, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10	// R10 keeps only the high BX of the 16 mask bits

	// Process the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched so we need to mask part of it.
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
end:
	MOVQ	R12, (R8)
	RET

	// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	endzero

	// Check if we'll load across a page boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	// We must ignore high bytes as they aren't part of our slice.
	// Create mask.
	MOVB	BX, CX
	MOVQ	$1, R10
	SALQ	CL, R10
	SUBQ	$1, R10	// R10 = (1<<BX)-1: low BX mask bits

	// Load data
	MOVOU	(SI), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	MOVQ	DX, (R8)
	RET
endzero:
	MOVQ	$0, (R8)
	RET

endofpage:
	// We must ignore low bytes as they aren't part of our slice.
	MOVQ	$16, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10	// keep only the high BX of the 16 mask bits

	// Load data into the high end of X1.
	MOVOU	-16(SI)(BX*1), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ	R10, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	POPCNTL	DX, DX
	MOVQ	DX, (R8)
	RET

avx2:
	CMPB	runtime·support_avx2(SB), $1
	JNE	sse
	MOVD	AX, X0
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1	// replicate the target byte into all 32 lanes
avx2_loop:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
	ADDQ	$32, DI
	CMPQ	DI, R11
	JLE	avx2_loop

	// If last block is already processed,
	// skip to the end.
	CMPQ	DI, R11
	JEQ	endavx

	// Load address of the last 32 bytes.
	// There is an overlap with the previous block.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, DX
	// Exit AVX mode.
	VZEROUPPER

	// Create mask to ignore overlap between previous 32 byte block
	// and the next.
	ANDQ	$31, BX
	MOVQ	$32, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFFFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10	// keep only the high BX of the 32 mask bits
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
	MOVQ	R12, (R8)
	RET
endavx:
	// Exit AVX mode.
	VZEROUPPER
	MOVQ	R12, (R8)
	RET

// return0 returns 0 in AX; used where a zero return register is needed.
TEXT runtime·return0(SB), NOSPLIT, $0
	MOVL	$0, AX
	RET

// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
// Must obey the gcc calling convention.
TEXT _cgo_topofstack(SB),NOSPLIT,$0
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), AX
	MOVQ	m_curg(AX), AX
	MOVQ	(g_stack+stack_hi)(AX), AX
	RET

// The top-most function running on a goroutine
// returns to goexit+PCQuantum.
TEXT runtime·goexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP
	CALL	runtime·goexit1(SB)	// does not return
	// traceback from goexit1 must hit code range of goexit
	BYTE	$0x90	// NOP

// prefetcht0 issues a temporal prefetch (all cache levels) for addr.
TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT0	(AX)
	RET

// prefetcht1 issues a prefetch into L2 and higher for addr.
TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT1	(AX)
	RET

// prefetcht2 issues a prefetch into L3 and higher for addr.
TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT2	(AX)
	RET

// prefetchnta issues a non-temporal prefetch for addr.
TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHNTA	(AX)
	RET

// This is called from .init_array and follows the platform, not Go, ABI.
// Appends the moduledata passed in DI to the runtime's module list.
TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
	PUSHQ	R15	// The access to global variables below implicitly uses R15, which is callee-save
	MOVQ	runtime·lastmoduledatap(SB), AX
	MOVQ	DI, moduledata_next(AX)
	MOVQ	DI, runtime·lastmoduledatap(SB)
	POPQ	R15
	RET