// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "go_tls.h"
#include "funcdata.h"
#include "textflag.h"

// rt0_go is the Go bootstrap entry point: it sets up g0's stack bounds,
// probes the CPU (CPUID/AVX), runs _cgo_init if present, installs TLS,
// wires up m0<->g0, then calls schedinit, queues runtime.main via newproc,
// and starts this M. It never returns (the trailing store is a crash trap).
TEXT runtime·rt0_go(SB),NOSPLIT,$0
	// copy arguments forward on an even stack
	MOVQ	DI, AX		// argc
	MOVQ	SI, BX		// argv
	SUBQ	$(4*8+7), SP		// 2args 2auto
	ANDQ	$~15, SP
	MOVQ	AX, 16(SP)
	MOVQ	BX, 24(SP)

	// create istack out of the given (operating system) stack.
	// _cgo_init may update stackguard.
	MOVQ	$runtime·g0(SB), DI
	LEAQ	(-64*1024+104)(SP), BX
	MOVQ	BX, g_stackguard0(DI)
	MOVQ	BX, g_stackguard1(DI)
	MOVQ	BX, (g_stack+stack_lo)(DI)
	MOVQ	SP, (g_stack+stack_hi)(DI)

	// find out information about the processor we're on
	MOVQ	$0, AX
	CPUID
	MOVQ	AX, SI		// SI = max basic CPUID leaf
	CMPQ	AX, $0
	JE	nocpuinfo

	// Figure out how to serialize RDTSC.
	// On Intel processors LFENCE is enough. AMD requires MFENCE.
	// Don't know about the rest, so let's do MFENCE.
	CMPL	BX, $0x756E6547  // "Genu"
	JNE	notintel
	CMPL	DX, $0x49656E69  // "ineI"
	JNE	notintel
	CMPL	CX, $0x6C65746E  // "ntel"
	JNE	notintel
	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
notintel:

	// Load EAX=1 cpuid flags
	MOVQ	$1, AX
	CPUID
	MOVL	CX, runtime·cpuid_ecx(SB)
	MOVL	DX, runtime·cpuid_edx(SB)

	// Load EAX=7/ECX=0 cpuid flags
	CMPQ	SI, $7
	JLT	no7
	MOVL	$7, AX
	MOVL	$0, CX
	CPUID
	MOVL	BX, runtime·cpuid_ebx7(SB)
no7:
	// Detect AVX and AVX2 as per 14.7.1  Detection of AVX2 chapter of [1]
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
	MOVL	runtime·cpuid_ecx(SB), CX
	ANDL	$0x18000000, CX		// check for OSXSAVE and AVX bits
	CMPL	CX, $0x18000000
	JNE	noavx
	MOVL	$0, CX
	// For XGETBV, OSXSAVE bit is required and sufficient
	XGETBV
	ANDL	$6, AX
	CMPL	AX, $6			// Check for OS support of YMM registers
	JNE	noavx
	MOVB	$1, runtime·support_avx(SB)
	TESTL	$(1<<5), runtime·cpuid_ebx7(SB)	// check for AVX2 bit
	JEQ	noavx2
	MOVB	$1, runtime·support_avx2(SB)
	JMP	nocpuinfo
noavx:
	MOVB	$0, runtime·support_avx(SB)
noavx2:
	MOVB	$0, runtime·support_avx2(SB)
nocpuinfo:

	// if there is an _cgo_init, call it.
	MOVQ	_cgo_init(SB), AX
	TESTQ	AX, AX
	JZ	needtls
	// g0 already in DI
	MOVQ	DI, CX	// Win64 uses CX for first parameter
	MOVQ	$setg_gcc<>(SB), SI
	CALL	AX

	// update stackguard after _cgo_init
	MOVQ	$runtime·g0(SB), CX
	MOVQ	(g_stack+stack_lo)(CX), AX
	ADDQ	$const__StackGuard, AX
	MOVQ	AX, g_stackguard0(CX)
	MOVQ	AX, g_stackguard1(CX)

#ifndef GOOS_windows
	JMP ok
#endif
needtls:
#ifdef GOOS_plan9
	// skip TLS setup on Plan 9
	JMP ok
#endif
#ifdef GOOS_solaris
	// skip TLS setup on Solaris
	JMP ok
#endif

	LEAQ	runtime·m0+m_tls(SB), DI
	CALL	runtime·settls(SB)

	// store through it, to make sure it works
	get_tls(BX)
	MOVQ	$0x123, g(BX)
	MOVQ	runtime·m0+m_tls(SB), AX
	CMPQ	AX, $0x123
	JEQ	2(PC)
	MOVL	AX, 0	// abort
ok:
	// set the per-goroutine and per-mach "registers"
	get_tls(BX)
	LEAQ	runtime·g0(SB), CX
	MOVQ	CX, g(BX)
	LEAQ	runtime·m0(SB), AX

	// save m->g0 = g0
	MOVQ	CX, m_g0(AX)
	// save m0 to g0->m
	MOVQ	AX, g_m(CX)

	CLD				// convention is D is always left cleared
	CALL	runtime·check(SB)

	MOVL	16(SP), AX		// copy argc
	MOVL	AX, 0(SP)
	MOVQ	24(SP), AX		// copy argv
	MOVQ	AX, 8(SP)
	CALL	runtime·args(SB)
	CALL	runtime·osinit(SB)
	CALL	runtime·schedinit(SB)

	// create a new goroutine to start program
	MOVQ	$runtime·mainPC(SB), AX		// entry
	PUSHQ	AX
	PUSHQ	$0			// arg size
	CALL	runtime·newproc(SB)
	POPQ	AX
	POPQ	AX

	// start this M
	CALL	runtime·mstart(SB)

	MOVL	$0xf1, 0xf1	// crash
	RET

// mainPC is a function value for runtime.main, placed in data so rt0_go
// can take its address for newproc.
DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
GLOBL	runtime·mainPC(SB),RODATA,$8

// breakpoint triggers a debugger trap (INT3).
TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
	BYTE	$0xcc
	RET

TEXT runtime·asminit(SB),NOSPLIT,$0-0
	// No per-thread init.
	RET

/*
 *  go-routine
 */

// void gosave(Gobuf*)
// save state in Gobuf; setjmp
TEXT runtime·gosave(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), AX		// gobuf
	LEAQ	buf+0(FP), BX		// caller's SP
	MOVQ	BX, gobuf_sp(AX)
	MOVQ	0(SP), BX		// caller's PC
	MOVQ	BX, gobuf_pc(AX)
	MOVQ	$0, gobuf_ret(AX)
	MOVQ	BP, gobuf_bp(AX)
	// Assert ctxt is zero. See func save.
	MOVQ	gobuf_ctxt(AX), BX
	TESTQ	BX, BX
	JZ	2(PC)
	CALL	runtime·badctxt(SB)
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	BX, gobuf_g(AX)
	RET

// void gogo(Gobuf*)
// restore state from Gobuf; longjmp
TEXT runtime·gogo(SB), NOSPLIT, $16-8
	MOVQ	buf+0(FP), BX		// gobuf

	// If ctxt is not nil, invoke deletion barrier before overwriting.
	MOVQ	gobuf_ctxt(BX), AX
	TESTQ	AX, AX
	JZ	nilctxt
	LEAQ	gobuf_ctxt(BX), AX
	MOVQ	AX, 0(SP)
	MOVQ	$0, 8(SP)
	CALL	runtime·writebarrierptr_prewrite(SB)
	MOVQ	buf+0(FP), BX

nilctxt:
	MOVQ	gobuf_g(BX), DX
	MOVQ	0(DX), CX		// make sure g != nil
	get_tls(CX)
	MOVQ	DX, g(CX)
	MOVQ	gobuf_sp(BX), SP	// restore SP
	MOVQ	gobuf_ret(BX), AX
	MOVQ	gobuf_ctxt(BX), DX
	MOVQ	gobuf_bp(BX), BP
	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
	MOVQ	$0, gobuf_ret(BX)
	MOVQ	$0, gobuf_ctxt(BX)
	MOVQ	$0, gobuf_bp(BX)
	MOVQ	gobuf_pc(BX), BX
	JMP	BX

// func mcall(fn func(*g))
// Switch to m->g0's stack, call fn(g).
// Fn must never return. It should gogo(&g->sched)
// to keep running g.
TEXT runtime·mcall(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI

	get_tls(CX)
	MOVQ	g(CX), AX	// save state in g->sched
	MOVQ	0(SP), BX	// caller's PC
	MOVQ	BX, (g_sched+gobuf_pc)(AX)
	LEAQ	fn+0(FP), BX	// caller's SP
	MOVQ	BX, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to m->g0 & its stack, call fn
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	SI, AX	// if g == m->g0 call badmcall
	JNE	3(PC)
	MOVQ	$runtime·badmcall(SB), AX
	JMP	AX
	MOVQ	SI, g(CX)	// g = m->g0
	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	PUSHQ	AX		// fn's argument: the old g
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI		// fn(g); must not return
	POPQ	AX
	MOVQ	$runtime·badmcall2(SB), AX
	JMP	AX
	RET

// systemstack_switch is a dummy routine that systemstack leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the system stack because the one at the top of
// the system stack terminates the stack walk (see topofstack()).
TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
	RET

// func systemstack(fn func())
TEXT runtime·systemstack(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI	// DI = fn
	get_tls(CX)
	MOVQ	g(CX), AX	// AX = g
	MOVQ	g_m(AX), BX	// BX = m

	MOVQ	m_gsignal(BX), DX	// DX = gsignal
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_g0(BX), DX	// DX = g0
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_curg(BX), R8
	CMPQ	AX, R8
	JEQ	switch

	// Bad: g is not gsignal, not g0, not curg. What is it?
	MOVQ	$runtime·badsystemstack(SB), AX
	CALL	AX

switch:
	// save our state in g->sched. Pretend to
	// be systemstack_switch if the G stack is scanned.
	MOVQ	$runtime·systemstack_switch(SB), SI
	MOVQ	SI, (g_sched+gobuf_pc)(AX)
	MOVQ	SP, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to g0
	MOVQ	DX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(DX), BX
	// make it look like mstart called systemstack on g0, to stop traceback
	SUBQ	$8, BX
	MOVQ	$runtime·mstart(SB), DX
	MOVQ	DX, 0(BX)
	MOVQ	BX, SP

	// call target function
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI

	// switch back to g
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), BX
	MOVQ	m_curg(BX), AX
	MOVQ	AX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(AX), SP
	MOVQ	$0, (g_sched+gobuf_sp)(AX)
	RET

noswitch:
	// already on m stack, just call directly
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI
	RET

/*
 * support for morestack
 */

// Called during function prolog when more stack is needed.
//
// The traceback routines see morestack on a g0 as being
// the top of a stack (for example, morestack calling newstack
// calling the scheduler calling newm calling gc), so we must
// record an argument size. For that purpose, it has no arguments.
TEXT runtime·morestack(SB),NOSPLIT,$0-0
	// Cannot grow scheduler stack (m->g0).
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	g(CX), SI
	JNE	3(PC)
	CALL	runtime·badmorestackg0(SB)
	INT	$3

	// Cannot grow signal stack (m->gsignal).
	MOVQ	m_gsignal(BX), SI
	CMPQ	g(CX), SI
	JNE	3(PC)
	CALL	runtime·badmorestackgsignal(SB)
	INT	$3

	// Called from f.
	// Set m->morebuf to f's caller.
	MOVQ	8(SP), AX	// f's caller's PC
	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
	LEAQ	16(SP), AX	// f's caller's SP
	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SI, (m_morebuf+gobuf_g)(BX)

	// Set g->sched to context in f.
	MOVQ	0(SP), AX	// f's PC
	MOVQ	AX, (g_sched+gobuf_pc)(SI)
	MOVQ	SI, (g_sched+gobuf_g)(SI)
	LEAQ	8(SP), AX	// f's SP
	MOVQ	AX, (g_sched+gobuf_sp)(SI)
	MOVQ	BP, (g_sched+gobuf_bp)(SI)
	// newstack will fill gobuf.ctxt.

	// Call newstack on m->g0's stack.
	MOVQ	m_g0(BX), BX
	MOVQ	BX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(BX), SP
	PUSHQ	DX	// ctxt argument
	CALL	runtime·newstack(SB)
	MOVQ	$0, 0x1003	// crash if newstack returns
	POPQ	DX	// keep balance check happy
	RET

// morestack but not preserving ctxt.
TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
	MOVL	$0, DX
	JMP	runtime·morestack(SB)

TEXT runtime·stackBarrier(SB),NOSPLIT,$0
	// We came here via a RET to an overwritten return PC.
	// AX may be live. Other registers are available.

	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
	get_tls(CX)
	MOVQ	g(CX), CX
	MOVQ	(g_stkbar+slice_array)(CX), DX
	MOVQ	g_stkbarPos(CX), BX
	IMULQ	$stkbar__size, BX	// Too big for SIB.
	MOVQ	stkbar_savedLRPtr(DX)(BX*1), R8
	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
	// Assert that we're popping the right saved LR.
	ADDQ	$8, R8
	CMPQ	R8, SP
	JEQ	2(PC)
	MOVL	$0, 0
	// Record that this stack barrier was hit.
	ADDQ	$1, g_stkbarPos(CX)
	// Jump to the original return PC.
	JMP	BX

// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
// of constant-sized-frame functions to encode a few bits of size in the pc.
// Caution: ugly multiline assembly macros in your future!

#define DISPATCH(NAME,MAXSIZE)		\
	CMPQ	CX, $MAXSIZE;		\
	JA	3(PC);			\
	MOVQ	$NAME(SB), AX;		\
	JMP	AX
// Note: can't just "JMP NAME(SB)" - bad inlining results.
429 430 TEXT reflect·call(SB), NOSPLIT, $0-0 431 JMP ·reflectcall(SB) 432 433 TEXT ·reflectcall(SB), NOSPLIT, $0-32 434 MOVLQZX argsize+24(FP), CX 435 DISPATCH(runtime·call32, 32) 436 DISPATCH(runtime·call64, 64) 437 DISPATCH(runtime·call128, 128) 438 DISPATCH(runtime·call256, 256) 439 DISPATCH(runtime·call512, 512) 440 DISPATCH(runtime·call1024, 1024) 441 DISPATCH(runtime·call2048, 2048) 442 DISPATCH(runtime·call4096, 4096) 443 DISPATCH(runtime·call8192, 8192) 444 DISPATCH(runtime·call16384, 16384) 445 DISPATCH(runtime·call32768, 32768) 446 DISPATCH(runtime·call65536, 65536) 447 DISPATCH(runtime·call131072, 131072) 448 DISPATCH(runtime·call262144, 262144) 449 DISPATCH(runtime·call524288, 524288) 450 DISPATCH(runtime·call1048576, 1048576) 451 DISPATCH(runtime·call2097152, 2097152) 452 DISPATCH(runtime·call4194304, 4194304) 453 DISPATCH(runtime·call8388608, 8388608) 454 DISPATCH(runtime·call16777216, 16777216) 455 DISPATCH(runtime·call33554432, 33554432) 456 DISPATCH(runtime·call67108864, 67108864) 457 DISPATCH(runtime·call134217728, 134217728) 458 DISPATCH(runtime·call268435456, 268435456) 459 DISPATCH(runtime·call536870912, 536870912) 460 DISPATCH(runtime·call1073741824, 1073741824) 461 MOVQ $runtime·badreflectcall(SB), AX 462 JMP AX 463 464 #define CALLFN(NAME,MAXSIZE) \ 465 TEXT NAME(SB), WRAPPER, $MAXSIZE-32; \ 466 NO_LOCAL_POINTERS; \ 467 /* copy arguments to stack */ \ 468 MOVQ argptr+16(FP), SI; \ 469 MOVLQZX argsize+24(FP), CX; \ 470 MOVQ SP, DI; \ 471 REP;MOVSB; \ 472 /* call function */ \ 473 MOVQ f+8(FP), DX; \ 474 PCDATA $PCDATA_StackMapIndex, $0; \ 475 CALL (DX); \ 476 /* copy return values back */ \ 477 MOVQ argtype+0(FP), DX; \ 478 MOVQ argptr+16(FP), DI; \ 479 MOVLQZX argsize+24(FP), CX; \ 480 MOVLQZX retoffset+28(FP), BX; \ 481 MOVQ SP, SI; \ 482 ADDQ BX, DI; \ 483 ADDQ BX, SI; \ 484 SUBQ BX, CX; \ 485 CALL callRet<>(SB); \ 486 RET 487 488 // callRet copies return values back at the end of call*. 
This is a 489 // separate function so it can allocate stack space for the arguments 490 // to reflectcallmove. It does not follow the Go ABI; it expects its 491 // arguments in registers. 492 TEXT callRet<>(SB), NOSPLIT, $32-0 493 NO_LOCAL_POINTERS 494 MOVQ DX, 0(SP) 495 MOVQ DI, 8(SP) 496 MOVQ SI, 16(SP) 497 MOVQ CX, 24(SP) 498 CALL runtime·reflectcallmove(SB) 499 RET 500 501 CALLFN(·call32, 32) 502 CALLFN(·call64, 64) 503 CALLFN(·call128, 128) 504 CALLFN(·call256, 256) 505 CALLFN(·call512, 512) 506 CALLFN(·call1024, 1024) 507 CALLFN(·call2048, 2048) 508 CALLFN(·call4096, 4096) 509 CALLFN(·call8192, 8192) 510 CALLFN(·call16384, 16384) 511 CALLFN(·call32768, 32768) 512 CALLFN(·call65536, 65536) 513 CALLFN(·call131072, 131072) 514 CALLFN(·call262144, 262144) 515 CALLFN(·call524288, 524288) 516 CALLFN(·call1048576, 1048576) 517 CALLFN(·call2097152, 2097152) 518 CALLFN(·call4194304, 4194304) 519 CALLFN(·call8388608, 8388608) 520 CALLFN(·call16777216, 16777216) 521 CALLFN(·call33554432, 33554432) 522 CALLFN(·call67108864, 67108864) 523 CALLFN(·call134217728, 134217728) 524 CALLFN(·call268435456, 268435456) 525 CALLFN(·call536870912, 536870912) 526 CALLFN(·call1073741824, 1073741824) 527 528 TEXT runtime·procyield(SB),NOSPLIT,$0-0 529 MOVL cycles+0(FP), AX 530 again: 531 PAUSE 532 SUBL $1, AX 533 JNZ again 534 RET 535 536 537 TEXT ·publicationBarrier(SB),NOSPLIT,$0-0 538 // Stores are already ordered on x86, so this is just a 539 // compile barrier. 540 RET 541 542 // void jmpdefer(fn, sp); 543 // called from deferreturn. 544 // 1. pop the caller 545 // 2. sub 5 bytes from the callers return 546 // 3. 
jmp to the argument 547 TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16 548 MOVQ fv+0(FP), DX // fn 549 MOVQ argp+8(FP), BX // caller sp 550 LEAQ -8(BX), SP // caller sp after CALL 551 MOVQ -8(SP), BP // restore BP as if deferreturn returned (harmless if framepointers not in use) 552 SUBQ $5, (SP) // return to CALL again 553 MOVQ 0(DX), BX 554 JMP BX // but first run the deferred function 555 556 // Save state of caller into g->sched. Smashes R8, R9. 557 TEXT gosave<>(SB),NOSPLIT,$0 558 get_tls(R8) 559 MOVQ g(R8), R8 560 MOVQ 0(SP), R9 561 MOVQ R9, (g_sched+gobuf_pc)(R8) 562 LEAQ 8(SP), R9 563 MOVQ R9, (g_sched+gobuf_sp)(R8) 564 MOVQ $0, (g_sched+gobuf_ret)(R8) 565 MOVQ BP, (g_sched+gobuf_bp)(R8) 566 // Assert ctxt is zero. See func save. 567 MOVQ (g_sched+gobuf_ctxt)(R8), R9 568 TESTQ R9, R9 569 JZ 2(PC) 570 CALL runtime·badctxt(SB) 571 RET 572 573 // func asmcgocall(fn, arg unsafe.Pointer) int32 574 // Call fn(arg) on the scheduler stack, 575 // aligned appropriately for the gcc ABI. 576 // See cgocall.go for more details. 577 TEXT ·asmcgocall(SB),NOSPLIT,$0-20 578 MOVQ fn+0(FP), AX 579 MOVQ arg+8(FP), BX 580 581 MOVQ SP, DX 582 583 // Figure out if we need to switch to m->g0 stack. 584 // We get called to create new OS threads too, and those 585 // come in on the m->g0 stack already. 586 get_tls(CX) 587 MOVQ g(CX), R8 588 CMPQ R8, $0 589 JEQ nosave 590 MOVQ g_m(R8), R8 591 MOVQ m_g0(R8), SI 592 MOVQ g(CX), DI 593 CMPQ SI, DI 594 JEQ nosave 595 MOVQ m_gsignal(R8), SI 596 CMPQ SI, DI 597 JEQ nosave 598 599 // Switch to system stack. 600 MOVQ m_g0(R8), SI 601 CALL gosave<>(SB) 602 MOVQ SI, g(CX) 603 MOVQ (g_sched+gobuf_sp)(SI), SP 604 605 // Now on a scheduling stack (a pthread-created stack). 606 // Make sure we have enough room for 4 stack-backed fast-call 607 // registers as per windows amd64 calling convention. 
608 SUBQ $64, SP 609 ANDQ $~15, SP // alignment for gcc ABI 610 MOVQ DI, 48(SP) // save g 611 MOVQ (g_stack+stack_hi)(DI), DI 612 SUBQ DX, DI 613 MOVQ DI, 40(SP) // save depth in stack (can't just save SP, as stack might be copied during a callback) 614 MOVQ BX, DI // DI = first argument in AMD64 ABI 615 MOVQ BX, CX // CX = first argument in Win64 616 CALL AX 617 618 // Restore registers, g, stack pointer. 619 get_tls(CX) 620 MOVQ 48(SP), DI 621 MOVQ (g_stack+stack_hi)(DI), SI 622 SUBQ 40(SP), SI 623 MOVQ DI, g(CX) 624 MOVQ SI, SP 625 626 MOVL AX, ret+16(FP) 627 RET 628 629 nosave: 630 // Running on a system stack, perhaps even without a g. 631 // Having no g can happen during thread creation or thread teardown 632 // (see needm/dropm on Solaris, for example). 633 // This code is like the above sequence but without saving/restoring g 634 // and without worrying about the stack moving out from under us 635 // (because we're on a system stack, not a goroutine stack). 636 // The above code could be used directly if already on a system stack, 637 // but then the only path through this code would be a rare case on Solaris. 638 // Using this code for all "already on system stack" calls exercises it more, 639 // which should help keep it correct. 640 SUBQ $64, SP 641 ANDQ $~15, SP 642 MOVQ $0, 48(SP) // where above code stores g, in case someone looks during debugging 643 MOVQ DX, 40(SP) // save original stack pointer 644 MOVQ BX, DI // DI = first argument in AMD64 ABI 645 MOVQ BX, CX // CX = first argument in Win64 646 CALL AX 647 MOVQ 40(SP), SI // restore original stack pointer 648 MOVQ SI, SP 649 MOVL AX, ret+16(FP) 650 RET 651 652 // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt) 653 // Turn the fn into a Go func (by taking its address) and call 654 // cgocallback_gofunc. 
655 TEXT runtime·cgocallback(SB),NOSPLIT,$32-32 656 LEAQ fn+0(FP), AX 657 MOVQ AX, 0(SP) 658 MOVQ frame+8(FP), AX 659 MOVQ AX, 8(SP) 660 MOVQ framesize+16(FP), AX 661 MOVQ AX, 16(SP) 662 MOVQ ctxt+24(FP), AX 663 MOVQ AX, 24(SP) 664 MOVQ $runtime·cgocallback_gofunc(SB), AX 665 CALL AX 666 RET 667 668 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt) 669 // See cgocall.go for more details. 670 TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32 671 NO_LOCAL_POINTERS 672 673 // If g is nil, Go did not create the current thread. 674 // Call needm to obtain one m for temporary use. 675 // In this case, we're running on the thread stack, so there's 676 // lots of space, but the linker doesn't know. Hide the call from 677 // the linker analysis by using an indirect call through AX. 678 get_tls(CX) 679 #ifdef GOOS_windows 680 MOVL $0, BX 681 CMPQ CX, $0 682 JEQ 2(PC) 683 #endif 684 MOVQ g(CX), BX 685 CMPQ BX, $0 686 JEQ needm 687 MOVQ g_m(BX), BX 688 MOVQ BX, R8 // holds oldm until end of function 689 JMP havem 690 needm: 691 MOVQ $0, 0(SP) 692 MOVQ $runtime·needm(SB), AX 693 CALL AX 694 MOVQ 0(SP), R8 695 get_tls(CX) 696 MOVQ g(CX), BX 697 MOVQ g_m(BX), BX 698 699 // Set m->sched.sp = SP, so that if a panic happens 700 // during the function we are about to execute, it will 701 // have a valid SP to run on the g0 stack. 702 // The next few lines (after the havem label) 703 // will save this SP onto the stack and then write 704 // the same SP back to m->sched.sp. That seems redundant, 705 // but if an unrecovered panic happens, unwindm will 706 // restore the g->sched.sp from the stack location 707 // and then systemstack will try to use it. If we don't set it here, 708 // that restored SP will be uninitialized (typically 0) and 709 // will not be usable. 710 MOVQ m_g0(BX), SI 711 MOVQ SP, (g_sched+gobuf_sp)(SI) 712 713 havem: 714 // Now there's a valid m, and we're running on its m->g0. 
715 // Save current m->g0->sched.sp on stack and then set it to SP. 716 // Save current sp in m->g0->sched.sp in preparation for 717 // switch back to m->curg stack. 718 // NOTE: unwindm knows that the saved g->sched.sp is at 0(SP). 719 MOVQ m_g0(BX), SI 720 MOVQ (g_sched+gobuf_sp)(SI), AX 721 MOVQ AX, 0(SP) 722 MOVQ SP, (g_sched+gobuf_sp)(SI) 723 724 // Switch to m->curg stack and call runtime.cgocallbackg. 725 // Because we are taking over the execution of m->curg 726 // but *not* resuming what had been running, we need to 727 // save that information (m->curg->sched) so we can restore it. 728 // We can restore m->curg->sched.sp easily, because calling 729 // runtime.cgocallbackg leaves SP unchanged upon return. 730 // To save m->curg->sched.pc, we push it onto the stack. 731 // This has the added benefit that it looks to the traceback 732 // routine like cgocallbackg is going to return to that 733 // PC (because the frame we allocate below has the same 734 // size as cgocallback_gofunc's frame declared above) 735 // so that the traceback will seamlessly trace back into 736 // the earlier calls. 737 // 738 // In the new goroutine, 8(SP) holds the saved R8. 739 MOVQ m_curg(BX), SI 740 MOVQ SI, g(CX) 741 MOVQ (g_sched+gobuf_sp)(SI), DI // prepare stack as DI 742 MOVQ (g_sched+gobuf_pc)(SI), BX 743 MOVQ BX, -8(DI) 744 // Compute the size of the frame, including return PC and, if 745 // GOEXPERIMENT=framepointer, the saved based pointer 746 MOVQ ctxt+24(FP), BX 747 LEAQ fv+0(FP), AX 748 SUBQ SP, AX 749 SUBQ AX, DI 750 MOVQ DI, SP 751 752 MOVQ R8, 8(SP) 753 MOVQ BX, 0(SP) 754 CALL runtime·cgocallbackg(SB) 755 MOVQ 8(SP), R8 756 757 // Compute the size of the frame again. FP and SP have 758 // completely different values here than they did above, 759 // but only their difference matters. 760 LEAQ fv+0(FP), AX 761 SUBQ SP, AX 762 763 // Restore g->sched (== m->curg->sched) from saved values. 
764 get_tls(CX) 765 MOVQ g(CX), SI 766 MOVQ SP, DI 767 ADDQ AX, DI 768 MOVQ -8(DI), BX 769 MOVQ BX, (g_sched+gobuf_pc)(SI) 770 MOVQ DI, (g_sched+gobuf_sp)(SI) 771 772 // Switch back to m->g0's stack and restore m->g0->sched.sp. 773 // (Unlike m->curg, the g0 goroutine never uses sched.pc, 774 // so we do not have to restore it.) 775 MOVQ g(CX), BX 776 MOVQ g_m(BX), BX 777 MOVQ m_g0(BX), SI 778 MOVQ SI, g(CX) 779 MOVQ (g_sched+gobuf_sp)(SI), SP 780 MOVQ 0(SP), AX 781 MOVQ AX, (g_sched+gobuf_sp)(SI) 782 783 // If the m on entry was nil, we called needm above to borrow an m 784 // for the duration of the call. Since the call is over, return it with dropm. 785 CMPQ R8, $0 786 JNE 3(PC) 787 MOVQ $runtime·dropm(SB), AX 788 CALL AX 789 790 // Done! 791 RET 792 793 // void setg(G*); set g. for use by needm. 794 TEXT runtime·setg(SB), NOSPLIT, $0-8 795 MOVQ gg+0(FP), BX 796 #ifdef GOOS_windows 797 CMPQ BX, $0 798 JNE settls 799 MOVQ $0, 0x28(GS) 800 RET 801 settls: 802 MOVQ g_m(BX), AX 803 LEAQ m_tls(AX), AX 804 MOVQ AX, 0x28(GS) 805 #endif 806 get_tls(CX) 807 MOVQ BX, g(CX) 808 RET 809 810 // void setg_gcc(G*); set g called from gcc. 811 TEXT setg_gcc<>(SB),NOSPLIT,$0 812 get_tls(AX) 813 MOVQ DI, g(AX) 814 RET 815 816 // check that SP is in range [g->stack.lo, g->stack.hi) 817 TEXT runtime·stackcheck(SB), NOSPLIT, $0-0 818 get_tls(CX) 819 MOVQ g(CX), AX 820 CMPQ (g_stack+stack_hi)(AX), SP 821 JHI 2(PC) 822 INT $3 823 CMPQ SP, (g_stack+stack_lo)(AX) 824 JHI 2(PC) 825 INT $3 826 RET 827 828 TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16 829 MOVQ argp+0(FP),AX // addr of first arg 830 MOVQ -8(AX),AX // get calling pc 831 CMPQ AX, runtime·stackBarrierPC(SB) 832 JNE nobar 833 // Get original return PC. 
834 CALL runtime·nextBarrierPC(SB) 835 MOVQ 0(SP), AX 836 nobar: 837 MOVQ AX, ret+8(FP) 838 RET 839 840 TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16 841 MOVQ argp+0(FP),AX // addr of first arg 842 MOVQ pc+8(FP), BX 843 MOVQ -8(AX), CX 844 CMPQ CX, runtime·stackBarrierPC(SB) 845 JEQ setbar 846 MOVQ BX, -8(AX) // set calling pc 847 RET 848 setbar: 849 // Set the stack barrier return PC. 850 MOVQ BX, 0(SP) 851 CALL runtime·setNextBarrierPC(SB) 852 RET 853 854 // func cputicks() int64 855 TEXT runtime·cputicks(SB),NOSPLIT,$0-0 856 CMPB runtime·lfenceBeforeRdtsc(SB), $1 857 JNE mfence 858 LFENCE 859 JMP done 860 mfence: 861 MFENCE 862 done: 863 RDTSC 864 SHLQ $32, DX 865 ADDQ DX, AX 866 MOVQ AX, ret+0(FP) 867 RET 868 869 // memhash_varlen(p unsafe.Pointer, h seed) uintptr 870 // redirects to memhash(p, h, size) using the size 871 // stored in the closure. 872 TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24 873 GO_ARGS 874 NO_LOCAL_POINTERS 875 MOVQ p+0(FP), AX 876 MOVQ h+8(FP), BX 877 MOVQ 8(DX), CX 878 MOVQ AX, 0(SP) 879 MOVQ BX, 8(SP) 880 MOVQ CX, 16(SP) 881 CALL runtime·memhash(SB) 882 MOVQ 24(SP), AX 883 MOVQ AX, ret+16(FP) 884 RET 885 886 // hash function using AES hardware instructions 887 TEXT runtime·aeshash(SB),NOSPLIT,$0-32 888 MOVQ p+0(FP), AX // ptr to data 889 MOVQ s+16(FP), CX // size 890 LEAQ ret+24(FP), DX 891 JMP runtime·aeshashbody(SB) 892 893 TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24 894 MOVQ p+0(FP), AX // ptr to string struct 895 MOVQ 8(AX), CX // length of string 896 MOVQ (AX), AX // string data 897 LEAQ ret+16(FP), DX 898 JMP runtime·aeshashbody(SB) 899 900 // AX: data 901 // CX: length 902 // DX: address to put return value 903 TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0 904 // Fill an SSE register with our seeds. 
905 MOVQ h+8(FP), X0 // 64 bits of per-table hash seed 906 PINSRW $4, CX, X0 // 16 bits of length 907 PSHUFHW $0, X0, X0 // repeat length 4 times total 908 MOVO X0, X1 // save unscrambled seed 909 PXOR runtime·aeskeysched(SB), X0 // xor in per-process seed 910 AESENC X0, X0 // scramble seed 911 912 CMPQ CX, $16 913 JB aes0to15 914 JE aes16 915 CMPQ CX, $32 916 JBE aes17to32 917 CMPQ CX, $64 918 JBE aes33to64 919 CMPQ CX, $128 920 JBE aes65to128 921 JMP aes129plus 922 923 aes0to15: 924 TESTQ CX, CX 925 JE aes0 926 927 ADDQ $16, AX 928 TESTW $0xff0, AX 929 JE endofpage 930 931 // 16 bytes loaded at this address won't cross 932 // a page boundary, so we can load it directly. 933 MOVOU -16(AX), X1 934 ADDQ CX, CX 935 MOVQ $masks<>(SB), AX 936 PAND (AX)(CX*8), X1 937 final1: 938 PXOR X0, X1 // xor data with seed 939 AESENC X1, X1 // scramble combo 3 times 940 AESENC X1, X1 941 AESENC X1, X1 942 MOVQ X1, (DX) 943 RET 944 945 endofpage: 946 // address ends in 1111xxxx. Might be up against 947 // a page boundary, so load ending at last byte. 948 // Then shift bytes down using pshufb. 
949 MOVOU -32(AX)(CX*1), X1 950 ADDQ CX, CX 951 MOVQ $shifts<>(SB), AX 952 PSHUFB (AX)(CX*8), X1 953 JMP final1 954 955 aes0: 956 // Return scrambled input seed 957 AESENC X0, X0 958 MOVQ X0, (DX) 959 RET 960 961 aes16: 962 MOVOU (AX), X1 963 JMP final1 964 965 aes17to32: 966 // make second starting seed 967 PXOR runtime·aeskeysched+16(SB), X1 968 AESENC X1, X1 969 970 // load data to be hashed 971 MOVOU (AX), X2 972 MOVOU -16(AX)(CX*1), X3 973 974 // xor with seed 975 PXOR X0, X2 976 PXOR X1, X3 977 978 // scramble 3 times 979 AESENC X2, X2 980 AESENC X3, X3 981 AESENC X2, X2 982 AESENC X3, X3 983 AESENC X2, X2 984 AESENC X3, X3 985 986 // combine results 987 PXOR X3, X2 988 MOVQ X2, (DX) 989 RET 990 991 aes33to64: 992 // make 3 more starting seeds 993 MOVO X1, X2 994 MOVO X1, X3 995 PXOR runtime·aeskeysched+16(SB), X1 996 PXOR runtime·aeskeysched+32(SB), X2 997 PXOR runtime·aeskeysched+48(SB), X3 998 AESENC X1, X1 999 AESENC X2, X2 1000 AESENC X3, X3 1001 1002 MOVOU (AX), X4 1003 MOVOU 16(AX), X5 1004 MOVOU -32(AX)(CX*1), X6 1005 MOVOU -16(AX)(CX*1), X7 1006 1007 PXOR X0, X4 1008 PXOR X1, X5 1009 PXOR X2, X6 1010 PXOR X3, X7 1011 1012 AESENC X4, X4 1013 AESENC X5, X5 1014 AESENC X6, X6 1015 AESENC X7, X7 1016 1017 AESENC X4, X4 1018 AESENC X5, X5 1019 AESENC X6, X6 1020 AESENC X7, X7 1021 1022 AESENC X4, X4 1023 AESENC X5, X5 1024 AESENC X6, X6 1025 AESENC X7, X7 1026 1027 PXOR X6, X4 1028 PXOR X7, X5 1029 PXOR X5, X4 1030 MOVQ X4, (DX) 1031 RET 1032 1033 aes65to128: 1034 // make 7 more starting seeds 1035 MOVO X1, X2 1036 MOVO X1, X3 1037 MOVO X1, X4 1038 MOVO X1, X5 1039 MOVO X1, X6 1040 MOVO X1, X7 1041 PXOR runtime·aeskeysched+16(SB), X1 1042 PXOR runtime·aeskeysched+32(SB), X2 1043 PXOR runtime·aeskeysched+48(SB), X3 1044 PXOR runtime·aeskeysched+64(SB), X4 1045 PXOR runtime·aeskeysched+80(SB), X5 1046 PXOR runtime·aeskeysched+96(SB), X6 1047 PXOR runtime·aeskeysched+112(SB), X7 1048 AESENC X1, X1 1049 AESENC X2, X2 1050 AESENC X3, X3 1051 AESENC X4, X4 1052 
AESENC X5, X5 1053 AESENC X6, X6 1054 AESENC X7, X7 1055 1056 // load data 1057 MOVOU (AX), X8 1058 MOVOU 16(AX), X9 1059 MOVOU 32(AX), X10 1060 MOVOU 48(AX), X11 1061 MOVOU -64(AX)(CX*1), X12 1062 MOVOU -48(AX)(CX*1), X13 1063 MOVOU -32(AX)(CX*1), X14 1064 MOVOU -16(AX)(CX*1), X15 1065 1066 // xor with seed 1067 PXOR X0, X8 1068 PXOR X1, X9 1069 PXOR X2, X10 1070 PXOR X3, X11 1071 PXOR X4, X12 1072 PXOR X5, X13 1073 PXOR X6, X14 1074 PXOR X7, X15 1075 1076 // scramble 3 times 1077 AESENC X8, X8 1078 AESENC X9, X9 1079 AESENC X10, X10 1080 AESENC X11, X11 1081 AESENC X12, X12 1082 AESENC X13, X13 1083 AESENC X14, X14 1084 AESENC X15, X15 1085 1086 AESENC X8, X8 1087 AESENC X9, X9 1088 AESENC X10, X10 1089 AESENC X11, X11 1090 AESENC X12, X12 1091 AESENC X13, X13 1092 AESENC X14, X14 1093 AESENC X15, X15 1094 1095 AESENC X8, X8 1096 AESENC X9, X9 1097 AESENC X10, X10 1098 AESENC X11, X11 1099 AESENC X12, X12 1100 AESENC X13, X13 1101 AESENC X14, X14 1102 AESENC X15, X15 1103 1104 // combine results 1105 PXOR X12, X8 1106 PXOR X13, X9 1107 PXOR X14, X10 1108 PXOR X15, X11 1109 PXOR X10, X8 1110 PXOR X11, X9 1111 PXOR X9, X8 1112 MOVQ X8, (DX) 1113 RET 1114 1115 aes129plus: 1116 // make 7 more starting seeds 1117 MOVO X1, X2 1118 MOVO X1, X3 1119 MOVO X1, X4 1120 MOVO X1, X5 1121 MOVO X1, X6 1122 MOVO X1, X7 1123 PXOR runtime·aeskeysched+16(SB), X1 1124 PXOR runtime·aeskeysched+32(SB), X2 1125 PXOR runtime·aeskeysched+48(SB), X3 1126 PXOR runtime·aeskeysched+64(SB), X4 1127 PXOR runtime·aeskeysched+80(SB), X5 1128 PXOR runtime·aeskeysched+96(SB), X6 1129 PXOR runtime·aeskeysched+112(SB), X7 1130 AESENC X1, X1 1131 AESENC X2, X2 1132 AESENC X3, X3 1133 AESENC X4, X4 1134 AESENC X5, X5 1135 AESENC X6, X6 1136 AESENC X7, X7 1137 1138 // start with last (possibly overlapping) block 1139 MOVOU -128(AX)(CX*1), X8 1140 MOVOU -112(AX)(CX*1), X9 1141 MOVOU -96(AX)(CX*1), X10 1142 MOVOU -80(AX)(CX*1), X11 1143 MOVOU -64(AX)(CX*1), X12 1144 MOVOU -48(AX)(CX*1), X13 1145 MOVOU 
-32(AX)(CX*1), X14 1146 MOVOU -16(AX)(CX*1), X15 1147 1148 // xor in seed 1149 PXOR X0, X8 1150 PXOR X1, X9 1151 PXOR X2, X10 1152 PXOR X3, X11 1153 PXOR X4, X12 1154 PXOR X5, X13 1155 PXOR X6, X14 1156 PXOR X7, X15 1157 1158 // compute number of remaining 128-byte blocks 1159 DECQ CX 1160 SHRQ $7, CX 1161 1162 aesloop: 1163 // scramble state 1164 AESENC X8, X8 1165 AESENC X9, X9 1166 AESENC X10, X10 1167 AESENC X11, X11 1168 AESENC X12, X12 1169 AESENC X13, X13 1170 AESENC X14, X14 1171 AESENC X15, X15 1172 1173 // scramble state, xor in a block 1174 MOVOU (AX), X0 1175 MOVOU 16(AX), X1 1176 MOVOU 32(AX), X2 1177 MOVOU 48(AX), X3 1178 AESENC X0, X8 1179 AESENC X1, X9 1180 AESENC X2, X10 1181 AESENC X3, X11 1182 MOVOU 64(AX), X4 1183 MOVOU 80(AX), X5 1184 MOVOU 96(AX), X6 1185 MOVOU 112(AX), X7 1186 AESENC X4, X12 1187 AESENC X5, X13 1188 AESENC X6, X14 1189 AESENC X7, X15 1190 1191 ADDQ $128, AX 1192 DECQ CX 1193 JNE aesloop 1194 1195 // 3 more scrambles to finish 1196 AESENC X8, X8 1197 AESENC X9, X9 1198 AESENC X10, X10 1199 AESENC X11, X11 1200 AESENC X12, X12 1201 AESENC X13, X13 1202 AESENC X14, X14 1203 AESENC X15, X15 1204 AESENC X8, X8 1205 AESENC X9, X9 1206 AESENC X10, X10 1207 AESENC X11, X11 1208 AESENC X12, X12 1209 AESENC X13, X13 1210 AESENC X14, X14 1211 AESENC X15, X15 1212 AESENC X8, X8 1213 AESENC X9, X9 1214 AESENC X10, X10 1215 AESENC X11, X11 1216 AESENC X12, X12 1217 AESENC X13, X13 1218 AESENC X14, X14 1219 AESENC X15, X15 1220 1221 PXOR X12, X8 1222 PXOR X13, X9 1223 PXOR X14, X10 1224 PXOR X15, X11 1225 PXOR X10, X8 1226 PXOR X11, X9 1227 PXOR X9, X8 1228 MOVQ X8, (DX) 1229 RET 1230 1231 TEXT runtime·aeshash32(SB),NOSPLIT,$0-24 1232 MOVQ p+0(FP), AX // ptr to data 1233 MOVQ h+8(FP), X0 // seed 1234 PINSRD $2, (AX), X0 // data 1235 AESENC runtime·aeskeysched+0(SB), X0 1236 AESENC runtime·aeskeysched+16(SB), X0 1237 AESENC runtime·aeskeysched+32(SB), X0 1238 MOVQ X0, ret+16(FP) 1239 RET 1240 1241 TEXT runtime·aeshash64(SB),NOSPLIT,$0-24 
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

// simple mask to get rid of data in the high part of the register.
// Entry i (16 bytes at offset i*16) keeps the low i bytes and zeroes the rest.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256

// checkASM reports whether the assembly's data-layout assumptions hold;
// here: that masks<>(SB) and shifts<>(SB) are aligned to 16-byte boundaries.
TEXT ·checkASM(SB),NOSPLIT,$0-1
	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
	MOVQ	$masks<>(SB), AX
	MOVQ	$shifts<>(SB), BX
	ORQ	BX, AX
	TESTQ	$15, AX
	SETEQ	ret+0(FP)
	RET

// these are arguments to pshufb. They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
DATA shifts<>+0x00(SB)/8, $0x0000000000000000
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256

// memequal(p, q unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal(SB),NOSPLIT,$0-25
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq	// identical pointers are trivially equal
	MOVQ	size+16(FP), BX
	LEAQ	ret+24(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+24(FP)
	RET

// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	8(DX), BX	// compiler stores size at offset 8 in the closure
	LEAQ	ret+16(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+16(FP)
	RET

// eqstring tests whether two strings are equal.
// The compiler guarantees that strings passed
// to eqstring have equal length.
// See runtime_test.go:eqstring_generic for
// equivalent Go code.
TEXT runtime·eqstring(SB),NOSPLIT,$0-33
	MOVQ	s1_base+0(FP), SI
	MOVQ	s2_base+16(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	s1_len+8(FP), BX
	LEAQ	ret+32(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+32(FP)
	RET

// a in SI
// b in DI
// count in BX
// address of result byte in AX
TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small
	CMPQ	BX, $64
	JB	bigloop
	CMPB	runtime·support_avx2(SB), $1
	JE	hugeloop_avx2

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff	// all 16 mask bits set <=> all 64 bytes equal
	JEQ	hugeloop
1412 MOVB $0, (AX) 1413 RET 1414 1415 // 64 bytes at a time using ymm registers 1416 hugeloop_avx2: 1417 CMPQ BX, $64 1418 JB bigloop_avx2 1419 VMOVDQU (SI), Y0 1420 VMOVDQU (DI), Y1 1421 VMOVDQU 32(SI), Y2 1422 VMOVDQU 32(DI), Y3 1423 VPCMPEQB Y1, Y0, Y4 1424 VPCMPEQB Y2, Y3, Y5 1425 VPAND Y4, Y5, Y6 1426 VPMOVMSKB Y6, DX 1427 ADDQ $64, SI 1428 ADDQ $64, DI 1429 SUBQ $64, BX 1430 CMPL DX, $0xffffffff 1431 JEQ hugeloop_avx2 1432 VZEROUPPER 1433 MOVB $0, (AX) 1434 RET 1435 1436 bigloop_avx2: 1437 VZEROUPPER 1438 1439 // 8 bytes at a time using 64-bit register 1440 bigloop: 1441 CMPQ BX, $8 1442 JBE leftover 1443 MOVQ (SI), CX 1444 MOVQ (DI), DX 1445 ADDQ $8, SI 1446 ADDQ $8, DI 1447 SUBQ $8, BX 1448 CMPQ CX, DX 1449 JEQ bigloop 1450 MOVB $0, (AX) 1451 RET 1452 1453 // remaining 0-8 bytes 1454 leftover: 1455 MOVQ -8(SI)(BX*1), CX 1456 MOVQ -8(DI)(BX*1), DX 1457 CMPQ CX, DX 1458 SETEQ (AX) 1459 RET 1460 1461 small: 1462 CMPQ BX, $0 1463 JEQ equal 1464 1465 LEAQ 0(BX*8), CX 1466 NEGQ CX 1467 1468 CMPB SI, $0xf8 1469 JA si_high 1470 1471 // load at SI won't cross a page boundary. 1472 MOVQ (SI), SI 1473 JMP si_finish 1474 si_high: 1475 // address ends in 11111xxx. Load up to bytes we want, move to correct position. 1476 MOVQ -8(SI)(BX*1), SI 1477 SHRQ CX, SI 1478 si_finish: 1479 1480 // same for DI. 
1481 CMPB DI, $0xf8 1482 JA di_high 1483 MOVQ (DI), DI 1484 JMP di_finish 1485 di_high: 1486 MOVQ -8(DI)(BX*1), DI 1487 SHRQ CX, DI 1488 di_finish: 1489 1490 SUBQ SI, DI 1491 SHLQ CX, DI 1492 equal: 1493 SETEQ (AX) 1494 RET 1495 1496 TEXT runtime·cmpstring(SB),NOSPLIT,$0-40 1497 MOVQ s1_base+0(FP), SI 1498 MOVQ s1_len+8(FP), BX 1499 MOVQ s2_base+16(FP), DI 1500 MOVQ s2_len+24(FP), DX 1501 LEAQ ret+32(FP), R9 1502 JMP runtime·cmpbody(SB) 1503 1504 TEXT bytes·Compare(SB),NOSPLIT,$0-56 1505 MOVQ s1+0(FP), SI 1506 MOVQ s1+8(FP), BX 1507 MOVQ s2+24(FP), DI 1508 MOVQ s2+32(FP), DX 1509 LEAQ res+48(FP), R9 1510 JMP runtime·cmpbody(SB) 1511 1512 // input: 1513 // SI = a 1514 // DI = b 1515 // BX = alen 1516 // DX = blen 1517 // R9 = address of output word (stores -1/0/1 here) 1518 TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 1519 CMPQ SI, DI 1520 JEQ allsame 1521 CMPQ BX, DX 1522 MOVQ DX, R8 1523 CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare 1524 CMPQ R8, $8 1525 JB small 1526 1527 CMPQ R8, $63 1528 JBE loop 1529 CMPB runtime·support_avx2(SB), $1 1530 JEQ big_loop_avx2 1531 JMP big_loop 1532 loop: 1533 CMPQ R8, $16 1534 JBE _0through16 1535 MOVOU (SI), X0 1536 MOVOU (DI), X1 1537 PCMPEQB X0, X1 1538 PMOVMSKB X1, AX 1539 XORQ $0xffff, AX // convert EQ to NE 1540 JNE diff16 // branch if at least one byte is not equal 1541 ADDQ $16, SI 1542 ADDQ $16, DI 1543 SUBQ $16, R8 1544 JMP loop 1545 1546 diff64: 1547 ADDQ $48, SI 1548 ADDQ $48, DI 1549 JMP diff16 1550 diff48: 1551 ADDQ $32, SI 1552 ADDQ $32, DI 1553 JMP diff16 1554 diff32: 1555 ADDQ $16, SI 1556 ADDQ $16, DI 1557 // AX = bit mask of differences 1558 diff16: 1559 BSFQ AX, BX // index of first byte that differs 1560 XORQ AX, AX 1561 MOVB (SI)(BX*1), CX 1562 CMPB CX, (DI)(BX*1) 1563 SETHI AX 1564 LEAQ -1(AX*2), AX // convert 1/0 to +1/-1 1565 MOVQ AX, (R9) 1566 RET 1567 1568 // 0 through 16 bytes left, alen>=8, blen>=8 1569 _0through16: 1570 CMPQ R8, $8 1571 JBE _0through8 1572 MOVQ (SI), AX 1573 MOVQ (DI), CX 
1574 CMPQ AX, CX 1575 JNE diff8 1576 _0through8: 1577 MOVQ -8(SI)(R8*1), AX 1578 MOVQ -8(DI)(R8*1), CX 1579 CMPQ AX, CX 1580 JEQ allsame 1581 1582 // AX and CX contain parts of a and b that differ. 1583 diff8: 1584 BSWAPQ AX // reverse order of bytes 1585 BSWAPQ CX 1586 XORQ AX, CX 1587 BSRQ CX, CX // index of highest bit difference 1588 SHRQ CX, AX // move a's bit to bottom 1589 ANDQ $1, AX // mask bit 1590 LEAQ -1(AX*2), AX // 1/0 => +1/-1 1591 MOVQ AX, (R9) 1592 RET 1593 1594 // 0-7 bytes in common 1595 small: 1596 LEAQ (R8*8), CX // bytes left -> bits left 1597 NEGQ CX // - bits lift (== 64 - bits left mod 64) 1598 JEQ allsame 1599 1600 // load bytes of a into high bytes of AX 1601 CMPB SI, $0xf8 1602 JA si_high 1603 MOVQ (SI), SI 1604 JMP si_finish 1605 si_high: 1606 MOVQ -8(SI)(R8*1), SI 1607 SHRQ CX, SI 1608 si_finish: 1609 SHLQ CX, SI 1610 1611 // load bytes of b in to high bytes of BX 1612 CMPB DI, $0xf8 1613 JA di_high 1614 MOVQ (DI), DI 1615 JMP di_finish 1616 di_high: 1617 MOVQ -8(DI)(R8*1), DI 1618 SHRQ CX, DI 1619 di_finish: 1620 SHLQ CX, DI 1621 1622 BSWAPQ SI // reverse order of bytes 1623 BSWAPQ DI 1624 XORQ SI, DI // find bit differences 1625 JEQ allsame 1626 BSRQ DI, CX // index of highest bit difference 1627 SHRQ CX, SI // move a's bit to bottom 1628 ANDQ $1, SI // mask bit 1629 LEAQ -1(SI*2), AX // 1/0 => +1/-1 1630 MOVQ AX, (R9) 1631 RET 1632 1633 allsame: 1634 XORQ AX, AX 1635 XORQ CX, CX 1636 CMPQ BX, DX 1637 SETGT AX // 1 if alen > blen 1638 SETEQ CX // 1 if alen == blen 1639 LEAQ -1(CX)(AX*2), AX // 1,0,-1 result 1640 MOVQ AX, (R9) 1641 RET 1642 1643 // this works for >= 64 bytes of data. 
1644 big_loop: 1645 MOVOU (SI), X0 1646 MOVOU (DI), X1 1647 PCMPEQB X0, X1 1648 PMOVMSKB X1, AX 1649 XORQ $0xffff, AX 1650 JNE diff16 1651 1652 MOVOU 16(SI), X0 1653 MOVOU 16(DI), X1 1654 PCMPEQB X0, X1 1655 PMOVMSKB X1, AX 1656 XORQ $0xffff, AX 1657 JNE diff32 1658 1659 MOVOU 32(SI), X0 1660 MOVOU 32(DI), X1 1661 PCMPEQB X0, X1 1662 PMOVMSKB X1, AX 1663 XORQ $0xffff, AX 1664 JNE diff48 1665 1666 MOVOU 48(SI), X0 1667 MOVOU 48(DI), X1 1668 PCMPEQB X0, X1 1669 PMOVMSKB X1, AX 1670 XORQ $0xffff, AX 1671 JNE diff64 1672 1673 ADDQ $64, SI 1674 ADDQ $64, DI 1675 SUBQ $64, R8 1676 CMPQ R8, $64 1677 JBE loop 1678 JMP big_loop 1679 1680 // Compare 64-bytes per loop iteration. 1681 // Loop is unrolled and uses AVX2. 1682 big_loop_avx2: 1683 VMOVDQU (SI), Y2 1684 VMOVDQU (DI), Y3 1685 VMOVDQU 32(SI), Y4 1686 VMOVDQU 32(DI), Y5 1687 VPCMPEQB Y2, Y3, Y0 1688 VPMOVMSKB Y0, AX 1689 XORL $0xffffffff, AX 1690 JNE diff32_avx2 1691 VPCMPEQB Y4, Y5, Y6 1692 VPMOVMSKB Y6, AX 1693 XORL $0xffffffff, AX 1694 JNE diff64_avx2 1695 1696 ADDQ $64, SI 1697 ADDQ $64, DI 1698 SUBQ $64, R8 1699 CMPQ R8, $64 1700 JB big_loop_avx2_exit 1701 JMP big_loop_avx2 1702 1703 // Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk. 1704 diff32_avx2: 1705 VZEROUPPER 1706 JMP diff16 1707 1708 // Same as diff32_avx2, but for last 32 bytes. 1709 diff64_avx2: 1710 VZEROUPPER 1711 JMP diff48 1712 1713 // For <64 bytes remainder jump to normal loop. 
big_loop_avx2_exit:
	VZEROUPPER
	JMP	loop

// supportAVX2 exposes the runtime's AVX2 capability bit to package strings.
TEXT strings·supportAVX2(SB),NOSPLIT,$0-1
	MOVBLZX	runtime·support_avx2(SB), AX
	MOVB	AX, ret+0(FP)
	RET

// supportAVX2 exposes the runtime's AVX2 capability bit to package bytes.
TEXT bytes·supportAVX2(SB),NOSPLIT,$0-1
	MOVBLZX	runtime·support_avx2(SB), AX
	MOVB	AX, ret+0(FP)
	RET

TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
	MOVQ	s+0(FP), DI
	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
	MOVQ	s_len+8(FP), DX
	MOVQ	c+16(FP), BP
	MOVQ	c_len+24(FP), AX
	MOVQ	DI, R10
	LEAQ	ret+32(FP), R11
	JMP	runtime·indexShortStr(SB)

TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
	MOVQ	s+0(FP), DI
	MOVQ	s_len+8(FP), DX
	MOVQ	c+24(FP), BP
	MOVQ	c_len+32(FP), AX
	MOVQ	DI, R10
	LEAQ	ret+48(FP), R11
	JMP	runtime·indexShortStr(SB)

// AX: length of string, that we are searching for
// DX: length of string, in which we are searching
// DI: pointer to string, in which we are searching
// BP: pointer to string, that we are searching for
// R11: address, where to put return value
TEXT runtime·indexShortStr(SB),NOSPLIT,$0
	CMPQ	AX, DX
	JA	fail	// needle longer than haystack: no match possible
	CMPQ	DX, $16
	JAE	sse42
no_sse42:
	// Each _N_or_more case loads the needle into registers once and
	// slides a window over the haystack one byte at a time.
	CMPQ	AX, $2
	JA	_3_or_more
	MOVW	(BP), BP
	LEAQ	-1(DI)(DX*1), DX
loop2:
	MOVW	(DI), SI
	CMPW	SI,BP
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop2
	JMP	fail
_3_or_more:
	CMPQ	AX, $3
	JA	_4_or_more
	MOVW	1(BP), BX
	MOVW	(BP), BP
	LEAQ	-2(DI)(DX*1), DX
loop3:
	MOVW	(DI), SI
	CMPW	SI,BP
	JZ	partial_success3
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop3
	JMP	fail
partial_success3:
	MOVW	1(DI), SI
	CMPW	SI,BX
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop3
	JMP	fail
_4_or_more:
	CMPQ	AX, $4
	JA	_5_or_more
	MOVL	(BP), BP
	LEAQ	-3(DI)(DX*1), DX
loop4:
	MOVL	(DI), SI
	CMPL	SI,BP
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop4
	JMP	fail
_5_or_more:
	CMPQ	AX, $7
	JA	_8_or_more
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX
	// compare first 4 bytes and last 4 bytes of the window (they overlap)
	MOVL	-4(BP)(AX*1), BX
	MOVL	(BP), BP
loop5to7:
	MOVL	(DI), SI
	CMPL	SI,BP
	JZ	partial_success5to7
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop5to7
	JMP	fail
partial_success5to7:
	MOVL	-4(AX)(DI*1), SI
	CMPL	SI,BX
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop5to7
	JMP	fail
_8_or_more:
	CMPQ	AX, $8
	JA	_9_or_more
	MOVQ	(BP), BP
	LEAQ	-7(DI)(DX*1), DX
loop8:
	MOVQ	(DI), SI
	CMPQ	SI,BP
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop8
	JMP	fail
_9_or_more:
	CMPQ	AX, $15
	JA	_16_or_more
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX
	MOVQ	-8(BP)(AX*1), BX
	MOVQ	(BP), BP
loop9to15:
	MOVQ	(DI), SI
	CMPQ	SI,BP
	JZ	partial_success9to15
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop9to15
	JMP	fail
partial_success9to15:
	MOVQ	-8(AX)(DI*1), SI
	CMPQ	SI,BX
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop9to15
	JMP	fail
_16_or_more:
	CMPQ	AX, $16
	JA	_17_or_more
	MOVOU	(BP), X1
	LEAQ	-15(DI)(DX*1), DX
loop16:
	MOVOU	(DI), X2
	PCMPEQB	X1, X2
	PMOVMSKB X2, SI
	CMPQ	SI, $0xffff
	JE	success
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop16
	JMP	fail
_17_or_more:
	CMPQ	AX, $31
	JA	_32_or_more
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX
	MOVOU	-16(BP)(AX*1), X0
	MOVOU	(BP), X1
loop17to31:
	MOVOU	(DI), X2
	PCMPEQB	X1,X2
	PMOVMSKB X2, SI
	CMPQ	SI, $0xffff
	JE	partial_success17to31
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop17to31
	JMP	fail
partial_success17to31:
	MOVOU	-16(AX)(DI*1), X3
	PCMPEQB	X0, X3
	PMOVMSKB X3, SI
	CMPQ	SI, $0xffff
	JE	success
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop17to31
	JMP	fail
// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
// So no need to check cpuid
_32_or_more:
	CMPQ	AX, $32
	JA	_33_to_63
	VMOVDQU	(BP), Y1
	LEAQ	-31(DI)(DX*1), DX
loop32:
	VMOVDQU	(DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL	SI, $0xffffffff
	JE	success_avx2
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop32
	JMP	fail_avx2
_33_to_63:
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX
	VMOVDQU	-32(BP)(AX*1), Y0
	VMOVDQU	(BP), Y1
loop33to63:
	VMOVDQU	(DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL	SI, $0xffffffff
	JE	partial_success33to63
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop33to63
	JMP	fail_avx2
partial_success33to63:
	VMOVDQU	-32(AX)(DI*1), Y3
	VPCMPEQB Y0, Y3, Y4
	VPMOVMSKB Y4, SI
	CMPL	SI, $0xffffffff
	JE	success_avx2
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop33to63
fail_avx2:
	VZEROUPPER
fail:
	MOVQ	$-1, (R11)
	RET
success_avx2:
	VZEROUPPER
	JMP	success
sse42:
	MOVL	runtime·cpuid_ecx(SB), CX
	ANDL	$0x100000, CX	// SSE4.2 feature bit
	JZ	no_sse42
	CMPQ	AX, $12
	// PCMPESTRI is slower than normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
1965 JAE _9_or_more 1966 LEAQ 16(BP), SI 1967 TESTW $0xff0, SI 1968 JEQ no_sse42 1969 MOVOU (BP), X1 1970 LEAQ -15(DI)(DX*1), SI 1971 MOVQ $16, R9 1972 SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9 1973 loop_sse42: 1974 // 0x0c means: unsigned byte compare (bits 0,1 are 00) 1975 // for equality (bits 2,3 are 11) 1976 // result is not masked or inverted (bits 4,5 are 00) 1977 // and corresponds to first matching byte (bit 6 is 0) 1978 PCMPESTRI $0x0c, (DI), X1 1979 // CX == 16 means no match, 1980 // CX > R9 means partial match at the end of the string, 1981 // otherwise sep is at offset CX from X1 start 1982 CMPQ CX, R9 1983 JBE sse42_success 1984 ADDQ R9, DI 1985 CMPQ DI, SI 1986 JB loop_sse42 1987 PCMPESTRI $0x0c, -1(SI), X1 1988 CMPQ CX, R9 1989 JA fail 1990 LEAQ -1(SI), DI 1991 sse42_success: 1992 ADDQ CX, DI 1993 success: 1994 SUBQ R10, DI 1995 MOVQ DI, (R11) 1996 RET 1997 1998 1999 TEXT bytes·IndexByte(SB),NOSPLIT,$0-40 2000 MOVQ s+0(FP), SI 2001 MOVQ s_len+8(FP), BX 2002 MOVB c+24(FP), AL 2003 LEAQ ret+32(FP), R8 2004 JMP runtime·indexbytebody(SB) 2005 2006 TEXT strings·IndexByte(SB),NOSPLIT,$0-32 2007 MOVQ s+0(FP), SI 2008 MOVQ s_len+8(FP), BX 2009 MOVB c+16(FP), AL 2010 LEAQ ret+24(FP), R8 2011 JMP runtime·indexbytebody(SB) 2012 2013 // input: 2014 // SI: data 2015 // BX: data len 2016 // AL: byte sought 2017 // R8: address to put result 2018 TEXT runtime·indexbytebody(SB),NOSPLIT,$0 2019 // Shuffle X0 around so that each byte contains 2020 // the character we're looking for. 2021 MOVD AX, X0 2022 PUNPCKLBW X0, X0 2023 PUNPCKLBW X0, X0 2024 PSHUFL $0, X0, X0 2025 2026 CMPQ BX, $16 2027 JLT small 2028 2029 MOVQ SI, DI 2030 2031 CMPQ BX, $32 2032 JA avx2 2033 sse: 2034 LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes 2035 JMP sseloopentry 2036 2037 sseloop: 2038 // Move the next 16-byte chunk of the data into X1. 2039 MOVOU (DI), X1 2040 // Compare bytes in X0 to X1. 
2041 PCMPEQB X0, X1 2042 // Take the top bit of each byte in X1 and put the result in DX. 2043 PMOVMSKB X1, DX 2044 // Find first set bit, if any. 2045 BSFL DX, DX 2046 JNZ ssesuccess 2047 // Advance to next block. 2048 ADDQ $16, DI 2049 sseloopentry: 2050 CMPQ DI, AX 2051 JB sseloop 2052 2053 // Search the last 16-byte chunk. This chunk may overlap with the 2054 // chunks we've already searched, but that's ok. 2055 MOVQ AX, DI 2056 MOVOU (AX), X1 2057 PCMPEQB X0, X1 2058 PMOVMSKB X1, DX 2059 BSFL DX, DX 2060 JNZ ssesuccess 2061 2062 failure: 2063 MOVQ $-1, (R8) 2064 RET 2065 2066 // We've found a chunk containing the byte. 2067 // The chunk was loaded from DI. 2068 // The index of the matching byte in the chunk is DX. 2069 // The start of the data is SI. 2070 ssesuccess: 2071 SUBQ SI, DI // Compute offset of chunk within data. 2072 ADDQ DX, DI // Add offset of byte within chunk. 2073 MOVQ DI, (R8) 2074 RET 2075 2076 // handle for lengths < 16 2077 small: 2078 TESTQ BX, BX 2079 JEQ failure 2080 2081 // Check if we'll load across a page boundary. 2082 LEAQ 16(SI), AX 2083 TESTW $0xff0, AX 2084 JEQ endofpage 2085 2086 MOVOU (SI), X1 // Load data 2087 PCMPEQB X0, X1 // Compare target byte with each byte in data. 2088 PMOVMSKB X1, DX // Move result bits to integer register. 2089 BSFL DX, DX // Find first set bit. 2090 JZ failure // No set bit, failure. 2091 CMPL DX, BX 2092 JAE failure // Match is past end of data. 2093 MOVQ DX, (R8) 2094 RET 2095 2096 endofpage: 2097 MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1. 2098 PCMPEQB X0, X1 // Compare target byte with each byte in data. 2099 PMOVMSKB X1, DX // Move result bits to integer register. 2100 MOVL BX, CX 2101 SHLL CX, DX 2102 SHRL $16, DX // Shift desired bits down to bottom of register. 2103 BSFL DX, DX // Find first set bit. 2104 JZ failure // No set bit, failure. 
2105 MOVQ DX, (R8) 2106 RET 2107 2108 avx2: 2109 CMPB runtime·support_avx2(SB), $1 2110 JNE sse 2111 MOVD AX, X0 2112 LEAQ -32(SI)(BX*1), R11 2113 VPBROADCASTB X0, Y1 2114 avx2_loop: 2115 VMOVDQU (DI), Y2 2116 VPCMPEQB Y1, Y2, Y3 2117 VPTEST Y3, Y3 2118 JNZ avx2success 2119 ADDQ $32, DI 2120 CMPQ DI, R11 2121 JLT avx2_loop 2122 MOVQ R11, DI 2123 VMOVDQU (DI), Y2 2124 VPCMPEQB Y1, Y2, Y3 2125 VPTEST Y3, Y3 2126 JNZ avx2success 2127 VZEROUPPER 2128 MOVQ $-1, (R8) 2129 RET 2130 2131 avx2success: 2132 VPMOVMSKB Y3, DX 2133 BSFL DX, DX 2134 SUBQ SI, DI 2135 ADDQ DI, DX 2136 MOVQ DX, (R8) 2137 VZEROUPPER 2138 RET 2139 2140 TEXT bytes·Equal(SB),NOSPLIT,$0-49 2141 MOVQ a_len+8(FP), BX 2142 MOVQ b_len+32(FP), CX 2143 CMPQ BX, CX 2144 JNE eqret 2145 MOVQ a+0(FP), SI 2146 MOVQ b+24(FP), DI 2147 LEAQ ret+48(FP), AX 2148 JMP runtime·memeqbody(SB) 2149 eqret: 2150 MOVB $0, ret+48(FP) 2151 RET 2152 2153 TEXT runtime·fastrand(SB), NOSPLIT, $0-4 2154 get_tls(CX) 2155 MOVQ g(CX), AX 2156 MOVQ g_m(AX), AX 2157 MOVL m_fastrand(AX), DX 2158 ADDL DX, DX 2159 MOVL DX, BX 2160 XORL $0x88888eef, DX 2161 CMOVLMI BX, DX 2162 MOVL DX, m_fastrand(AX) 2163 MOVL DX, ret+0(FP) 2164 RET 2165 2166 TEXT runtime·return0(SB), NOSPLIT, $0 2167 MOVL $0, AX 2168 RET 2169 2170 2171 // Called from cgo wrappers, this function returns g->m->curg.stack.hi. 2172 // Must obey the gcc calling convention. 2173 TEXT _cgo_topofstack(SB),NOSPLIT,$0 2174 get_tls(CX) 2175 MOVQ g(CX), AX 2176 MOVQ g_m(AX), AX 2177 MOVQ m_curg(AX), AX 2178 MOVQ (g_stack+stack_hi)(AX), AX 2179 RET 2180 2181 // The top-most function running on a goroutine 2182 // returns to goexit+PCQuantum. 
2183 TEXT runtime·goexit(SB),NOSPLIT,$0-0 2184 BYTE $0x90 // NOP 2185 CALL runtime·goexit1(SB) // does not return 2186 // traceback from goexit1 must hit code range of goexit 2187 BYTE $0x90 // NOP 2188 2189 TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8 2190 MOVQ addr+0(FP), AX 2191 PREFETCHT0 (AX) 2192 RET 2193 2194 TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8 2195 MOVQ addr+0(FP), AX 2196 PREFETCHT1 (AX) 2197 RET 2198 2199 TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8 2200 MOVQ addr+0(FP), AX 2201 PREFETCHT2 (AX) 2202 RET 2203 2204 TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8 2205 MOVQ addr+0(FP), AX 2206 PREFETCHNTA (AX) 2207 RET 2208 2209 // This is called from .init_array and follows the platform, not Go, ABI. 2210 TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0 2211 PUSHQ R15 // The access to global variables below implicitly uses R15, which is callee-save 2212 MOVQ runtime·lastmoduledatap(SB), AX 2213 MOVQ DI, moduledata_next(AX) 2214 MOVQ DI, runtime·lastmoduledatap(SB) 2215 POPQ R15 2216 RET