github.com/sbinet/go@v0.0.0-20160827155028-54d7de7dd62b/src/runtime/asm_amd64.s (about) 1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #include "go_asm.h" 6 #include "go_tls.h" 7 #include "funcdata.h" 8 #include "textflag.h" 9 10 TEXT runtime·rt0_go(SB),NOSPLIT,$0 11 // copy arguments forward on an even stack 12 MOVQ DI, AX // argc 13 MOVQ SI, BX // argv 14 SUBQ $(4*8+7), SP // 2args 2auto 15 ANDQ $~15, SP 16 MOVQ AX, 16(SP) 17 MOVQ BX, 24(SP) 18 19 // create istack out of the given (operating system) stack. 20 // _cgo_init may update stackguard. 21 MOVQ $runtime·g0(SB), DI 22 LEAQ (-64*1024+104)(SP), BX 23 MOVQ BX, g_stackguard0(DI) 24 MOVQ BX, g_stackguard1(DI) 25 MOVQ BX, (g_stack+stack_lo)(DI) 26 MOVQ SP, (g_stack+stack_hi)(DI) 27 28 // find out information about the processor we're on 29 MOVQ $0, AX 30 CPUID 31 MOVQ AX, SI 32 CMPQ AX, $0 33 JE nocpuinfo 34 35 // Figure out how to serialize RDTSC. 36 // On Intel processors LFENCE is enough. AMD requires MFENCE. 37 // Don't know about the rest, so let's do MFENCE. 
38 CMPL BX, $0x756E6547 // "Genu" 39 JNE notintel 40 CMPL DX, $0x49656E69 // "ineI" 41 JNE notintel 42 CMPL CX, $0x6C65746E // "ntel" 43 JNE notintel 44 MOVB $1, runtime·lfenceBeforeRdtsc(SB) 45 notintel: 46 47 // Load EAX=1 cpuid flags 48 MOVQ $1, AX 49 CPUID 50 MOVL CX, runtime·cpuid_ecx(SB) 51 MOVL DX, runtime·cpuid_edx(SB) 52 53 // Load EAX=7/ECX=0 cpuid flags 54 CMPQ SI, $7 55 JLT no7 56 MOVL $7, AX 57 MOVL $0, CX 58 CPUID 59 MOVL BX, runtime·cpuid_ebx7(SB) 60 no7: 61 // Detect AVX and AVX2 as per 14.7.1 Detection of AVX2 chapter of [1] 62 // [1] 64-ia-32-architectures-software-developer-manual-325462.pdf 63 // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf 64 MOVL runtime·cpuid_ecx(SB), CX 65 ANDL $0x18000000, CX // check for OSXSAVE and AVX bits 66 CMPL CX, $0x18000000 67 JNE noavx 68 MOVL $0, CX 69 // For XGETBV, OSXSAVE bit is required and sufficient 70 XGETBV 71 ANDL $6, AX 72 CMPL AX, $6 // Check for OS support of YMM registers 73 JNE noavx 74 MOVB $1, runtime·support_avx(SB) 75 TESTL $(1<<5), runtime·cpuid_ebx7(SB) // check for AVX2 bit 76 JEQ noavx2 77 MOVB $1, runtime·support_avx2(SB) 78 JMP nocpuinfo 79 noavx: 80 MOVB $0, runtime·support_avx(SB) 81 noavx2: 82 MOVB $0, runtime·support_avx2(SB) 83 nocpuinfo: 84 85 // if there is an _cgo_init, call it. 
86 MOVQ _cgo_init(SB), AX 87 TESTQ AX, AX 88 JZ needtls 89 // g0 already in DI 90 MOVQ DI, CX // Win64 uses CX for first parameter 91 MOVQ $setg_gcc<>(SB), SI 92 CALL AX 93 94 // update stackguard after _cgo_init 95 MOVQ $runtime·g0(SB), CX 96 MOVQ (g_stack+stack_lo)(CX), AX 97 ADDQ $const__StackGuard, AX 98 MOVQ AX, g_stackguard0(CX) 99 MOVQ AX, g_stackguard1(CX) 100 101 #ifndef GOOS_windows 102 JMP ok 103 #endif 104 needtls: 105 #ifdef GOOS_plan9 106 // skip TLS setup on Plan 9 107 JMP ok 108 #endif 109 #ifdef GOOS_solaris 110 // skip TLS setup on Solaris 111 JMP ok 112 #endif 113 114 LEAQ runtime·m0+m_tls(SB), DI 115 CALL runtime·settls(SB) 116 117 // store through it, to make sure it works 118 get_tls(BX) 119 MOVQ $0x123, g(BX) 120 MOVQ runtime·m0+m_tls(SB), AX 121 CMPQ AX, $0x123 122 JEQ 2(PC) 123 MOVL AX, 0 // abort 124 ok: 125 // set the per-goroutine and per-mach "registers" 126 get_tls(BX) 127 LEAQ runtime·g0(SB), CX 128 MOVQ CX, g(BX) 129 LEAQ runtime·m0(SB), AX 130 131 // save m->g0 = g0 132 MOVQ CX, m_g0(AX) 133 // save m0 to g0->m 134 MOVQ AX, g_m(CX) 135 136 CLD // convention is D is always left cleared 137 CALL runtime·check(SB) 138 139 MOVL 16(SP), AX // copy argc 140 MOVL AX, 0(SP) 141 MOVQ 24(SP), AX // copy argv 142 MOVQ AX, 8(SP) 143 CALL runtime·args(SB) 144 CALL runtime·osinit(SB) 145 CALL runtime·schedinit(SB) 146 147 // create a new goroutine to start program 148 MOVQ $runtime·mainPC(SB), AX // entry 149 PUSHQ AX 150 PUSHQ $0 // arg size 151 CALL runtime·newproc(SB) 152 POPQ AX 153 POPQ AX 154 155 // start this M 156 CALL runtime·mstart(SB) 157 158 MOVL $0xf1, 0xf1 // crash 159 RET 160 161 DATA runtime·mainPC+0(SB)/8,$runtime·main(SB) 162 GLOBL runtime·mainPC(SB),RODATA,$8 163 164 TEXT runtime·breakpoint(SB),NOSPLIT,$0-0 165 BYTE $0xcc 166 RET 167 168 TEXT runtime·asminit(SB),NOSPLIT,$0-0 169 // No per-thread init. 
170 RET 171 172 /* 173 * go-routine 174 */ 175 176 // void gosave(Gobuf*) 177 // save state in Gobuf; setjmp 178 TEXT runtime·gosave(SB), NOSPLIT, $0-8 179 MOVQ buf+0(FP), AX // gobuf 180 LEAQ buf+0(FP), BX // caller's SP 181 MOVQ BX, gobuf_sp(AX) 182 MOVQ 0(SP), BX // caller's PC 183 MOVQ BX, gobuf_pc(AX) 184 MOVQ $0, gobuf_ret(AX) 185 MOVQ $0, gobuf_ctxt(AX) 186 MOVQ BP, gobuf_bp(AX) 187 get_tls(CX) 188 MOVQ g(CX), BX 189 MOVQ BX, gobuf_g(AX) 190 RET 191 192 // void gogo(Gobuf*) 193 // restore state from Gobuf; longjmp 194 TEXT runtime·gogo(SB), NOSPLIT, $0-8 195 MOVQ buf+0(FP), BX // gobuf 196 MOVQ gobuf_g(BX), DX 197 MOVQ 0(DX), CX // make sure g != nil 198 get_tls(CX) 199 MOVQ DX, g(CX) 200 MOVQ gobuf_sp(BX), SP // restore SP 201 MOVQ gobuf_ret(BX), AX 202 MOVQ gobuf_ctxt(BX), DX 203 MOVQ gobuf_bp(BX), BP 204 MOVQ $0, gobuf_sp(BX) // clear to help garbage collector 205 MOVQ $0, gobuf_ret(BX) 206 MOVQ $0, gobuf_ctxt(BX) 207 MOVQ $0, gobuf_bp(BX) 208 MOVQ gobuf_pc(BX), BX 209 JMP BX 210 211 // func mcall(fn func(*g)) 212 // Switch to m->g0's stack, call fn(g). 213 // Fn must never return. It should gogo(&g->sched) 214 // to keep running g. 
215 TEXT runtime·mcall(SB), NOSPLIT, $0-8 216 MOVQ fn+0(FP), DI 217 218 get_tls(CX) 219 MOVQ g(CX), AX // save state in g->sched 220 MOVQ 0(SP), BX // caller's PC 221 MOVQ BX, (g_sched+gobuf_pc)(AX) 222 LEAQ fn+0(FP), BX // caller's SP 223 MOVQ BX, (g_sched+gobuf_sp)(AX) 224 MOVQ AX, (g_sched+gobuf_g)(AX) 225 MOVQ BP, (g_sched+gobuf_bp)(AX) 226 227 // switch to m->g0 & its stack, call fn 228 MOVQ g(CX), BX 229 MOVQ g_m(BX), BX 230 MOVQ m_g0(BX), SI 231 CMPQ SI, AX // if g == m->g0 call badmcall 232 JNE 3(PC) 233 MOVQ $runtime·badmcall(SB), AX 234 JMP AX 235 MOVQ SI, g(CX) // g = m->g0 236 MOVQ (g_sched+gobuf_sp)(SI), SP // sp = m->g0->sched.sp 237 PUSHQ AX 238 MOVQ DI, DX 239 MOVQ 0(DI), DI 240 CALL DI 241 POPQ AX 242 MOVQ $runtime·badmcall2(SB), AX 243 JMP AX 244 RET 245 246 // systemstack_switch is a dummy routine that systemstack leaves at the bottom 247 // of the G stack. We need to distinguish the routine that 248 // lives at the bottom of the G stack from the one that lives 249 // at the top of the system stack because the one at the top of 250 // the system stack terminates the stack walk (see topofstack()). 251 TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0 252 RET 253 254 // func systemstack(fn func()) 255 TEXT runtime·systemstack(SB), NOSPLIT, $0-8 256 MOVQ fn+0(FP), DI // DI = fn 257 get_tls(CX) 258 MOVQ g(CX), AX // AX = g 259 MOVQ g_m(AX), BX // BX = m 260 261 MOVQ m_gsignal(BX), DX // DX = gsignal 262 CMPQ AX, DX 263 JEQ noswitch 264 265 MOVQ m_g0(BX), DX // DX = g0 266 CMPQ AX, DX 267 JEQ noswitch 268 269 MOVQ m_curg(BX), R8 270 CMPQ AX, R8 271 JEQ switch 272 273 // Bad: g is not gsignal, not g0, not curg. What is it? 274 MOVQ $runtime·badsystemstack(SB), AX 275 CALL AX 276 277 switch: 278 // save our state in g->sched. Pretend to 279 // be systemstack_switch if the G stack is scanned. 
280 MOVQ $runtime·systemstack_switch(SB), SI 281 MOVQ SI, (g_sched+gobuf_pc)(AX) 282 MOVQ SP, (g_sched+gobuf_sp)(AX) 283 MOVQ AX, (g_sched+gobuf_g)(AX) 284 MOVQ BP, (g_sched+gobuf_bp)(AX) 285 286 // switch to g0 287 MOVQ DX, g(CX) 288 MOVQ (g_sched+gobuf_sp)(DX), BX 289 // make it look like mstart called systemstack on g0, to stop traceback 290 SUBQ $8, BX 291 MOVQ $runtime·mstart(SB), DX 292 MOVQ DX, 0(BX) 293 MOVQ BX, SP 294 295 // call target function 296 MOVQ DI, DX 297 MOVQ 0(DI), DI 298 CALL DI 299 300 // switch back to g 301 get_tls(CX) 302 MOVQ g(CX), AX 303 MOVQ g_m(AX), BX 304 MOVQ m_curg(BX), AX 305 MOVQ AX, g(CX) 306 MOVQ (g_sched+gobuf_sp)(AX), SP 307 MOVQ $0, (g_sched+gobuf_sp)(AX) 308 RET 309 310 noswitch: 311 // already on m stack, just call directly 312 MOVQ DI, DX 313 MOVQ 0(DI), DI 314 CALL DI 315 RET 316 317 /* 318 * support for morestack 319 */ 320 321 // Called during function prolog when more stack is needed. 322 // 323 // The traceback routines see morestack on a g0 as being 324 // the top of a stack (for example, morestack calling newstack 325 // calling the scheduler calling newm calling gc), so we must 326 // record an argument size. For that purpose, it has no arguments. 327 TEXT runtime·morestack(SB),NOSPLIT,$0-0 328 // Cannot grow scheduler stack (m->g0). 329 get_tls(CX) 330 MOVQ g(CX), BX 331 MOVQ g_m(BX), BX 332 MOVQ m_g0(BX), SI 333 CMPQ g(CX), SI 334 JNE 2(PC) 335 INT $3 336 337 // Cannot grow signal stack (m->gsignal). 338 MOVQ m_gsignal(BX), SI 339 CMPQ g(CX), SI 340 JNE 2(PC) 341 INT $3 342 343 // Called from f. 344 // Set m->morebuf to f's caller. 345 MOVQ 8(SP), AX // f's caller's PC 346 MOVQ AX, (m_morebuf+gobuf_pc)(BX) 347 LEAQ 16(SP), AX // f's caller's SP 348 MOVQ AX, (m_morebuf+gobuf_sp)(BX) 349 get_tls(CX) 350 MOVQ g(CX), SI 351 MOVQ SI, (m_morebuf+gobuf_g)(BX) 352 353 // Set g->sched to context in f. 
354 MOVQ 0(SP), AX // f's PC 355 MOVQ AX, (g_sched+gobuf_pc)(SI) 356 MOVQ SI, (g_sched+gobuf_g)(SI) 357 LEAQ 8(SP), AX // f's SP 358 MOVQ AX, (g_sched+gobuf_sp)(SI) 359 MOVQ DX, (g_sched+gobuf_ctxt)(SI) 360 MOVQ BP, (g_sched+gobuf_bp)(SI) 361 362 // Call newstack on m->g0's stack. 363 MOVQ m_g0(BX), BX 364 MOVQ BX, g(CX) 365 MOVQ (g_sched+gobuf_sp)(BX), SP 366 CALL runtime·newstack(SB) 367 MOVQ $0, 0x1003 // crash if newstack returns 368 RET 369 370 // morestack but not preserving ctxt. 371 TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0 372 MOVL $0, DX 373 JMP runtime·morestack(SB) 374 375 TEXT runtime·stackBarrier(SB),NOSPLIT,$0 376 // We came here via a RET to an overwritten return PC. 377 // AX may be live. Other registers are available. 378 379 // Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal. 380 get_tls(CX) 381 MOVQ g(CX), CX 382 MOVQ (g_stkbar+slice_array)(CX), DX 383 MOVQ g_stkbarPos(CX), BX 384 IMULQ $stkbar__size, BX // Too big for SIB. 385 MOVQ stkbar_savedLRPtr(DX)(BX*1), R8 386 MOVQ stkbar_savedLRVal(DX)(BX*1), BX 387 // Assert that we're popping the right saved LR. 388 ADDQ $8, R8 389 CMPQ R8, SP 390 JEQ 2(PC) 391 MOVL $0, 0 392 // Record that this stack barrier was hit. 393 ADDQ $1, g_stkbarPos(CX) 394 // Jump to the original return PC. 395 JMP BX 396 397 // reflectcall: call a function with the given argument list 398 // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32). 399 // we don't have variable-sized frames, so we use a small number 400 // of constant-sized-frame functions to encode a few bits of size in the pc. 401 // Caution: ugly multiline assembly macros in your future! 402 403 #define DISPATCH(NAME,MAXSIZE) \ 404 CMPQ CX, $MAXSIZE; \ 405 JA 3(PC); \ 406 MOVQ $NAME(SB), AX; \ 407 JMP AX 408 // Note: can't just "JMP NAME(SB)" - bad inlining results. 
409 410 TEXT reflect·call(SB), NOSPLIT, $0-0 411 JMP ·reflectcall(SB) 412 413 TEXT ·reflectcall(SB), NOSPLIT, $0-32 414 MOVLQZX argsize+24(FP), CX 415 // NOTE(rsc): No call16, because CALLFN needs four words 416 // of argument space to invoke callwritebarrier. 417 DISPATCH(runtime·call32, 32) 418 DISPATCH(runtime·call64, 64) 419 DISPATCH(runtime·call128, 128) 420 DISPATCH(runtime·call256, 256) 421 DISPATCH(runtime·call512, 512) 422 DISPATCH(runtime·call1024, 1024) 423 DISPATCH(runtime·call2048, 2048) 424 DISPATCH(runtime·call4096, 4096) 425 DISPATCH(runtime·call8192, 8192) 426 DISPATCH(runtime·call16384, 16384) 427 DISPATCH(runtime·call32768, 32768) 428 DISPATCH(runtime·call65536, 65536) 429 DISPATCH(runtime·call131072, 131072) 430 DISPATCH(runtime·call262144, 262144) 431 DISPATCH(runtime·call524288, 524288) 432 DISPATCH(runtime·call1048576, 1048576) 433 DISPATCH(runtime·call2097152, 2097152) 434 DISPATCH(runtime·call4194304, 4194304) 435 DISPATCH(runtime·call8388608, 8388608) 436 DISPATCH(runtime·call16777216, 16777216) 437 DISPATCH(runtime·call33554432, 33554432) 438 DISPATCH(runtime·call67108864, 67108864) 439 DISPATCH(runtime·call134217728, 134217728) 440 DISPATCH(runtime·call268435456, 268435456) 441 DISPATCH(runtime·call536870912, 536870912) 442 DISPATCH(runtime·call1073741824, 1073741824) 443 MOVQ $runtime·badreflectcall(SB), AX 444 JMP AX 445 446 #define CALLFN(NAME,MAXSIZE) \ 447 TEXT NAME(SB), WRAPPER, $MAXSIZE-32; \ 448 NO_LOCAL_POINTERS; \ 449 /* copy arguments to stack */ \ 450 MOVQ argptr+16(FP), SI; \ 451 MOVLQZX argsize+24(FP), CX; \ 452 MOVQ SP, DI; \ 453 REP;MOVSB; \ 454 /* call function */ \ 455 MOVQ f+8(FP), DX; \ 456 PCDATA $PCDATA_StackMapIndex, $0; \ 457 CALL (DX); \ 458 /* copy return values back */ \ 459 MOVQ argptr+16(FP), DI; \ 460 MOVLQZX argsize+24(FP), CX; \ 461 MOVLQZX retoffset+28(FP), BX; \ 462 MOVQ SP, SI; \ 463 ADDQ BX, DI; \ 464 ADDQ BX, SI; \ 465 SUBQ BX, CX; \ 466 REP;MOVSB; \ 467 /* execute write barrier updates */ \ 468 MOVQ 
argtype+0(FP), DX; \ 469 MOVQ argptr+16(FP), DI; \ 470 MOVLQZX argsize+24(FP), CX; \ 471 MOVLQZX retoffset+28(FP), BX; \ 472 MOVQ DX, 0(SP); \ 473 MOVQ DI, 8(SP); \ 474 MOVQ CX, 16(SP); \ 475 MOVQ BX, 24(SP); \ 476 CALL runtime·callwritebarrier(SB); \ 477 RET 478 479 CALLFN(·call32, 32) 480 CALLFN(·call64, 64) 481 CALLFN(·call128, 128) 482 CALLFN(·call256, 256) 483 CALLFN(·call512, 512) 484 CALLFN(·call1024, 1024) 485 CALLFN(·call2048, 2048) 486 CALLFN(·call4096, 4096) 487 CALLFN(·call8192, 8192) 488 CALLFN(·call16384, 16384) 489 CALLFN(·call32768, 32768) 490 CALLFN(·call65536, 65536) 491 CALLFN(·call131072, 131072) 492 CALLFN(·call262144, 262144) 493 CALLFN(·call524288, 524288) 494 CALLFN(·call1048576, 1048576) 495 CALLFN(·call2097152, 2097152) 496 CALLFN(·call4194304, 4194304) 497 CALLFN(·call8388608, 8388608) 498 CALLFN(·call16777216, 16777216) 499 CALLFN(·call33554432, 33554432) 500 CALLFN(·call67108864, 67108864) 501 CALLFN(·call134217728, 134217728) 502 CALLFN(·call268435456, 268435456) 503 CALLFN(·call536870912, 536870912) 504 CALLFN(·call1073741824, 1073741824) 505 506 TEXT runtime·procyield(SB),NOSPLIT,$0-0 507 MOVL cycles+0(FP), AX 508 again: 509 PAUSE 510 SUBL $1, AX 511 JNZ again 512 RET 513 514 515 TEXT ·publicationBarrier(SB),NOSPLIT,$0-0 516 // Stores are already ordered on x86, so this is just a 517 // compile barrier. 518 RET 519 520 // void jmpdefer(fn, sp); 521 // called from deferreturn. 522 // 1. pop the caller 523 // 2. sub 5 bytes from the callers return 524 // 3. jmp to the argument 525 TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16 526 MOVQ fv+0(FP), DX // fn 527 MOVQ argp+8(FP), BX // caller sp 528 LEAQ -8(BX), SP // caller sp after CALL 529 MOVQ -8(SP), BP // restore BP as if deferreturn returned (harmless if framepointers not in use) 530 SUBQ $5, (SP) // return to CALL again 531 MOVQ 0(DX), BX 532 JMP BX // but first run the deferred function 533 534 // Save state of caller into g->sched. Smashes R8, R9. 
535 TEXT gosave<>(SB),NOSPLIT,$0 536 get_tls(R8) 537 MOVQ g(R8), R8 538 MOVQ 0(SP), R9 539 MOVQ R9, (g_sched+gobuf_pc)(R8) 540 LEAQ 8(SP), R9 541 MOVQ R9, (g_sched+gobuf_sp)(R8) 542 MOVQ $0, (g_sched+gobuf_ret)(R8) 543 MOVQ $0, (g_sched+gobuf_ctxt)(R8) 544 MOVQ BP, (g_sched+gobuf_bp)(R8) 545 RET 546 547 // func asmcgocall(fn, arg unsafe.Pointer) int32 548 // Call fn(arg) on the scheduler stack, 549 // aligned appropriately for the gcc ABI. 550 // See cgocall.go for more details. 551 TEXT ·asmcgocall(SB),NOSPLIT,$0-20 552 MOVQ fn+0(FP), AX 553 MOVQ arg+8(FP), BX 554 555 MOVQ SP, DX 556 557 // Figure out if we need to switch to m->g0 stack. 558 // We get called to create new OS threads too, and those 559 // come in on the m->g0 stack already. 560 get_tls(CX) 561 MOVQ g(CX), R8 562 CMPQ R8, $0 563 JEQ nosave 564 MOVQ g_m(R8), R8 565 MOVQ m_g0(R8), SI 566 MOVQ g(CX), DI 567 CMPQ SI, DI 568 JEQ nosave 569 MOVQ m_gsignal(R8), SI 570 CMPQ SI, DI 571 JEQ nosave 572 573 // Switch to system stack. 574 MOVQ m_g0(R8), SI 575 CALL gosave<>(SB) 576 MOVQ SI, g(CX) 577 MOVQ (g_sched+gobuf_sp)(SI), SP 578 579 // Now on a scheduling stack (a pthread-created stack). 580 // Make sure we have enough room for 4 stack-backed fast-call 581 // registers as per windows amd64 calling convention. 582 SUBQ $64, SP 583 ANDQ $~15, SP // alignment for gcc ABI 584 MOVQ DI, 48(SP) // save g 585 MOVQ (g_stack+stack_hi)(DI), DI 586 SUBQ DX, DI 587 MOVQ DI, 40(SP) // save depth in stack (can't just save SP, as stack might be copied during a callback) 588 MOVQ BX, DI // DI = first argument in AMD64 ABI 589 MOVQ BX, CX // CX = first argument in Win64 590 CALL AX 591 592 // Restore registers, g, stack pointer. 593 get_tls(CX) 594 MOVQ 48(SP), DI 595 MOVQ (g_stack+stack_hi)(DI), SI 596 SUBQ 40(SP), SI 597 MOVQ DI, g(CX) 598 MOVQ SI, SP 599 600 MOVL AX, ret+16(FP) 601 RET 602 603 nosave: 604 // Running on a system stack, perhaps even without a g. 
605 // Having no g can happen during thread creation or thread teardown 606 // (see needm/dropm on Solaris, for example). 607 // This code is like the above sequence but without saving/restoring g 608 // and without worrying about the stack moving out from under us 609 // (because we're on a system stack, not a goroutine stack). 610 // The above code could be used directly if already on a system stack, 611 // but then the only path through this code would be a rare case on Solaris. 612 // Using this code for all "already on system stack" calls exercises it more, 613 // which should help keep it correct. 614 SUBQ $64, SP 615 ANDQ $~15, SP 616 MOVQ $0, 48(SP) // where above code stores g, in case someone looks during debugging 617 MOVQ DX, 40(SP) // save original stack pointer 618 MOVQ BX, DI // DI = first argument in AMD64 ABI 619 MOVQ BX, CX // CX = first argument in Win64 620 CALL AX 621 MOVQ 40(SP), SI // restore original stack pointer 622 MOVQ SI, SP 623 MOVL AX, ret+16(FP) 624 RET 625 626 // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt) 627 // Turn the fn into a Go func (by taking its address) and call 628 // cgocallback_gofunc. 629 TEXT runtime·cgocallback(SB),NOSPLIT,$32-32 630 LEAQ fn+0(FP), AX 631 MOVQ AX, 0(SP) 632 MOVQ frame+8(FP), AX 633 MOVQ AX, 8(SP) 634 MOVQ framesize+16(FP), AX 635 MOVQ AX, 16(SP) 636 MOVQ ctxt+24(FP), AX 637 MOVQ AX, 24(SP) 638 MOVQ $runtime·cgocallback_gofunc(SB), AX 639 CALL AX 640 RET 641 642 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt) 643 // See cgocall.go for more details. 644 TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32 645 NO_LOCAL_POINTERS 646 647 // If g is nil, Go did not create the current thread. 648 // Call needm to obtain one m for temporary use. 649 // In this case, we're running on the thread stack, so there's 650 // lots of space, but the linker doesn't know. Hide the call from 651 // the linker analysis by using an indirect call through AX. 
652 get_tls(CX) 653 #ifdef GOOS_windows 654 MOVL $0, BX 655 CMPQ CX, $0 656 JEQ 2(PC) 657 #endif 658 MOVQ g(CX), BX 659 CMPQ BX, $0 660 JEQ needm 661 MOVQ g_m(BX), BX 662 MOVQ BX, R8 // holds oldm until end of function 663 JMP havem 664 needm: 665 MOVQ $0, 0(SP) 666 MOVQ $runtime·needm(SB), AX 667 CALL AX 668 MOVQ 0(SP), R8 669 get_tls(CX) 670 MOVQ g(CX), BX 671 MOVQ g_m(BX), BX 672 673 // Set m->sched.sp = SP, so that if a panic happens 674 // during the function we are about to execute, it will 675 // have a valid SP to run on the g0 stack. 676 // The next few lines (after the havem label) 677 // will save this SP onto the stack and then write 678 // the same SP back to m->sched.sp. That seems redundant, 679 // but if an unrecovered panic happens, unwindm will 680 // restore the g->sched.sp from the stack location 681 // and then systemstack will try to use it. If we don't set it here, 682 // that restored SP will be uninitialized (typically 0) and 683 // will not be usable. 684 MOVQ m_g0(BX), SI 685 MOVQ SP, (g_sched+gobuf_sp)(SI) 686 687 havem: 688 // Now there's a valid m, and we're running on its m->g0. 689 // Save current m->g0->sched.sp on stack and then set it to SP. 690 // Save current sp in m->g0->sched.sp in preparation for 691 // switch back to m->curg stack. 692 // NOTE: unwindm knows that the saved g->sched.sp is at 0(SP). 693 MOVQ m_g0(BX), SI 694 MOVQ (g_sched+gobuf_sp)(SI), AX 695 MOVQ AX, 0(SP) 696 MOVQ SP, (g_sched+gobuf_sp)(SI) 697 698 // Switch to m->curg stack and call runtime.cgocallbackg. 699 // Because we are taking over the execution of m->curg 700 // but *not* resuming what had been running, we need to 701 // save that information (m->curg->sched) so we can restore it. 702 // We can restore m->curg->sched.sp easily, because calling 703 // runtime.cgocallbackg leaves SP unchanged upon return. 704 // To save m->curg->sched.pc, we push it onto the stack. 
705 // This has the added benefit that it looks to the traceback 706 // routine like cgocallbackg is going to return to that 707 // PC (because the frame we allocate below has the same 708 // size as cgocallback_gofunc's frame declared above) 709 // so that the traceback will seamlessly trace back into 710 // the earlier calls. 711 // 712 // In the new goroutine, 8(SP) holds the saved R8. 713 MOVQ m_curg(BX), SI 714 MOVQ SI, g(CX) 715 MOVQ (g_sched+gobuf_sp)(SI), DI // prepare stack as DI 716 MOVQ (g_sched+gobuf_pc)(SI), BX 717 MOVQ BX, -8(DI) 718 // Compute the size of the frame, including return PC and, if 719 // GOEXPERIMENT=framepointer, the saved based pointer 720 MOVQ ctxt+24(FP), BX 721 LEAQ fv+0(FP), AX 722 SUBQ SP, AX 723 SUBQ AX, DI 724 MOVQ DI, SP 725 726 MOVQ R8, 8(SP) 727 MOVQ BX, 0(SP) 728 CALL runtime·cgocallbackg(SB) 729 MOVQ 8(SP), R8 730 731 // Compute the size of the frame again. FP and SP have 732 // completely different values here than they did above, 733 // but only their difference matters. 734 LEAQ fv+0(FP), AX 735 SUBQ SP, AX 736 737 // Restore g->sched (== m->curg->sched) from saved values. 738 get_tls(CX) 739 MOVQ g(CX), SI 740 MOVQ SP, DI 741 ADDQ AX, DI 742 MOVQ -8(DI), BX 743 MOVQ BX, (g_sched+gobuf_pc)(SI) 744 MOVQ DI, (g_sched+gobuf_sp)(SI) 745 746 // Switch back to m->g0's stack and restore m->g0->sched.sp. 747 // (Unlike m->curg, the g0 goroutine never uses sched.pc, 748 // so we do not have to restore it.) 749 MOVQ g(CX), BX 750 MOVQ g_m(BX), BX 751 MOVQ m_g0(BX), SI 752 MOVQ SI, g(CX) 753 MOVQ (g_sched+gobuf_sp)(SI), SP 754 MOVQ 0(SP), AX 755 MOVQ AX, (g_sched+gobuf_sp)(SI) 756 757 // If the m on entry was nil, we called needm above to borrow an m 758 // for the duration of the call. Since the call is over, return it with dropm. 759 CMPQ R8, $0 760 JNE 3(PC) 761 MOVQ $runtime·dropm(SB), AX 762 CALL AX 763 764 // Done! 765 RET 766 767 // void setg(G*); set g. for use by needm. 
768 TEXT runtime·setg(SB), NOSPLIT, $0-8 769 MOVQ gg+0(FP), BX 770 #ifdef GOOS_windows 771 CMPQ BX, $0 772 JNE settls 773 MOVQ $0, 0x28(GS) 774 RET 775 settls: 776 MOVQ g_m(BX), AX 777 LEAQ m_tls(AX), AX 778 MOVQ AX, 0x28(GS) 779 #endif 780 get_tls(CX) 781 MOVQ BX, g(CX) 782 RET 783 784 // void setg_gcc(G*); set g called from gcc. 785 TEXT setg_gcc<>(SB),NOSPLIT,$0 786 get_tls(AX) 787 MOVQ DI, g(AX) 788 RET 789 790 // check that SP is in range [g->stack.lo, g->stack.hi) 791 TEXT runtime·stackcheck(SB), NOSPLIT, $0-0 792 get_tls(CX) 793 MOVQ g(CX), AX 794 CMPQ (g_stack+stack_hi)(AX), SP 795 JHI 2(PC) 796 INT $3 797 CMPQ SP, (g_stack+stack_lo)(AX) 798 JHI 2(PC) 799 INT $3 800 RET 801 802 TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16 803 MOVQ argp+0(FP),AX // addr of first arg 804 MOVQ -8(AX),AX // get calling pc 805 CMPQ AX, runtime·stackBarrierPC(SB) 806 JNE nobar 807 // Get original return PC. 808 CALL runtime·nextBarrierPC(SB) 809 MOVQ 0(SP), AX 810 nobar: 811 MOVQ AX, ret+8(FP) 812 RET 813 814 TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16 815 MOVQ argp+0(FP),AX // addr of first arg 816 MOVQ pc+8(FP), BX 817 MOVQ -8(AX), CX 818 CMPQ CX, runtime·stackBarrierPC(SB) 819 JEQ setbar 820 MOVQ BX, -8(AX) // set calling pc 821 RET 822 setbar: 823 // Set the stack barrier return PC. 824 MOVQ BX, 0(SP) 825 CALL runtime·setNextBarrierPC(SB) 826 RET 827 828 TEXT runtime·getcallersp(SB),NOSPLIT,$0-16 829 MOVQ argp+0(FP), AX 830 MOVQ AX, ret+8(FP) 831 RET 832 833 // func cputicks() int64 834 TEXT runtime·cputicks(SB),NOSPLIT,$0-0 835 CMPB runtime·lfenceBeforeRdtsc(SB), $1 836 JNE mfence 837 LFENCE 838 JMP done 839 mfence: 840 MFENCE 841 done: 842 RDTSC 843 SHLQ $32, DX 844 ADDQ DX, AX 845 MOVQ AX, ret+0(FP) 846 RET 847 848 // memhash_varlen(p unsafe.Pointer, h seed) uintptr 849 // redirects to memhash(p, h, size) using the size 850 // stored in the closure. 
851 TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24 852 GO_ARGS 853 NO_LOCAL_POINTERS 854 MOVQ p+0(FP), AX 855 MOVQ h+8(FP), BX 856 MOVQ 8(DX), CX 857 MOVQ AX, 0(SP) 858 MOVQ BX, 8(SP) 859 MOVQ CX, 16(SP) 860 CALL runtime·memhash(SB) 861 MOVQ 24(SP), AX 862 MOVQ AX, ret+16(FP) 863 RET 864 865 // hash function using AES hardware instructions 866 TEXT runtime·aeshash(SB),NOSPLIT,$0-32 867 MOVQ p+0(FP), AX // ptr to data 868 MOVQ s+16(FP), CX // size 869 LEAQ ret+24(FP), DX 870 JMP runtime·aeshashbody(SB) 871 872 TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24 873 MOVQ p+0(FP), AX // ptr to string struct 874 MOVQ 8(AX), CX // length of string 875 MOVQ (AX), AX // string data 876 LEAQ ret+16(FP), DX 877 JMP runtime·aeshashbody(SB) 878 879 // AX: data 880 // CX: length 881 // DX: address to put return value 882 TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0 883 // Fill an SSE register with our seeds. 884 MOVQ h+8(FP), X0 // 64 bits of per-table hash seed 885 PINSRW $4, CX, X0 // 16 bits of length 886 PSHUFHW $0, X0, X0 // repeat length 4 times total 887 MOVO X0, X1 // save unscrambled seed 888 PXOR runtime·aeskeysched(SB), X0 // xor in per-process seed 889 AESENC X0, X0 // scramble seed 890 891 CMPQ CX, $16 892 JB aes0to15 893 JE aes16 894 CMPQ CX, $32 895 JBE aes17to32 896 CMPQ CX, $64 897 JBE aes33to64 898 CMPQ CX, $128 899 JBE aes65to128 900 JMP aes129plus 901 902 aes0to15: 903 TESTQ CX, CX 904 JE aes0 905 906 ADDQ $16, AX 907 TESTW $0xff0, AX 908 JE endofpage 909 910 // 16 bytes loaded at this address won't cross 911 // a page boundary, so we can load it directly. 912 MOVOU -16(AX), X1 913 ADDQ CX, CX 914 MOVQ $masks<>(SB), AX 915 PAND (AX)(CX*8), X1 916 final1: 917 PXOR X0, X1 // xor data with seed 918 AESENC X1, X1 // scramble combo 3 times 919 AESENC X1, X1 920 AESENC X1, X1 921 MOVQ X1, (DX) 922 RET 923 924 endofpage: 925 // address ends in 1111xxxx. Might be up against 926 // a page boundary, so load ending at last byte. 927 // Then shift bytes down using pshufb. 
928 MOVOU -32(AX)(CX*1), X1 929 ADDQ CX, CX 930 MOVQ $shifts<>(SB), AX 931 PSHUFB (AX)(CX*8), X1 932 JMP final1 933 934 aes0: 935 // Return scrambled input seed 936 AESENC X0, X0 937 MOVQ X0, (DX) 938 RET 939 940 aes16: 941 MOVOU (AX), X1 942 JMP final1 943 944 aes17to32: 945 // make second starting seed 946 PXOR runtime·aeskeysched+16(SB), X1 947 AESENC X1, X1 948 949 // load data to be hashed 950 MOVOU (AX), X2 951 MOVOU -16(AX)(CX*1), X3 952 953 // xor with seed 954 PXOR X0, X2 955 PXOR X1, X3 956 957 // scramble 3 times 958 AESENC X2, X2 959 AESENC X3, X3 960 AESENC X2, X2 961 AESENC X3, X3 962 AESENC X2, X2 963 AESENC X3, X3 964 965 // combine results 966 PXOR X3, X2 967 MOVQ X2, (DX) 968 RET 969 970 aes33to64: 971 // make 3 more starting seeds 972 MOVO X1, X2 973 MOVO X1, X3 974 PXOR runtime·aeskeysched+16(SB), X1 975 PXOR runtime·aeskeysched+32(SB), X2 976 PXOR runtime·aeskeysched+48(SB), X3 977 AESENC X1, X1 978 AESENC X2, X2 979 AESENC X3, X3 980 981 MOVOU (AX), X4 982 MOVOU 16(AX), X5 983 MOVOU -32(AX)(CX*1), X6 984 MOVOU -16(AX)(CX*1), X7 985 986 PXOR X0, X4 987 PXOR X1, X5 988 PXOR X2, X6 989 PXOR X3, X7 990 991 AESENC X4, X4 992 AESENC X5, X5 993 AESENC X6, X6 994 AESENC X7, X7 995 996 AESENC X4, X4 997 AESENC X5, X5 998 AESENC X6, X6 999 AESENC X7, X7 1000 1001 AESENC X4, X4 1002 AESENC X5, X5 1003 AESENC X6, X6 1004 AESENC X7, X7 1005 1006 PXOR X6, X4 1007 PXOR X7, X5 1008 PXOR X5, X4 1009 MOVQ X4, (DX) 1010 RET 1011 1012 aes65to128: 1013 // make 7 more starting seeds 1014 MOVO X1, X2 1015 MOVO X1, X3 1016 MOVO X1, X4 1017 MOVO X1, X5 1018 MOVO X1, X6 1019 MOVO X1, X7 1020 PXOR runtime·aeskeysched+16(SB), X1 1021 PXOR runtime·aeskeysched+32(SB), X2 1022 PXOR runtime·aeskeysched+48(SB), X3 1023 PXOR runtime·aeskeysched+64(SB), X4 1024 PXOR runtime·aeskeysched+80(SB), X5 1025 PXOR runtime·aeskeysched+96(SB), X6 1026 PXOR runtime·aeskeysched+112(SB), X7 1027 AESENC X1, X1 1028 AESENC X2, X2 1029 AESENC X3, X3 1030 AESENC X4, X4 1031 AESENC X5, X5 1032 
AESENC X6, X6 1033 AESENC X7, X7 1034 1035 // load data 1036 MOVOU (AX), X8 1037 MOVOU 16(AX), X9 1038 MOVOU 32(AX), X10 1039 MOVOU 48(AX), X11 1040 MOVOU -64(AX)(CX*1), X12 1041 MOVOU -48(AX)(CX*1), X13 1042 MOVOU -32(AX)(CX*1), X14 1043 MOVOU -16(AX)(CX*1), X15 1044 1045 // xor with seed 1046 PXOR X0, X8 1047 PXOR X1, X9 1048 PXOR X2, X10 1049 PXOR X3, X11 1050 PXOR X4, X12 1051 PXOR X5, X13 1052 PXOR X6, X14 1053 PXOR X7, X15 1054 1055 // scramble 3 times 1056 AESENC X8, X8 1057 AESENC X9, X9 1058 AESENC X10, X10 1059 AESENC X11, X11 1060 AESENC X12, X12 1061 AESENC X13, X13 1062 AESENC X14, X14 1063 AESENC X15, X15 1064 1065 AESENC X8, X8 1066 AESENC X9, X9 1067 AESENC X10, X10 1068 AESENC X11, X11 1069 AESENC X12, X12 1070 AESENC X13, X13 1071 AESENC X14, X14 1072 AESENC X15, X15 1073 1074 AESENC X8, X8 1075 AESENC X9, X9 1076 AESENC X10, X10 1077 AESENC X11, X11 1078 AESENC X12, X12 1079 AESENC X13, X13 1080 AESENC X14, X14 1081 AESENC X15, X15 1082 1083 // combine results 1084 PXOR X12, X8 1085 PXOR X13, X9 1086 PXOR X14, X10 1087 PXOR X15, X11 1088 PXOR X10, X8 1089 PXOR X11, X9 1090 PXOR X9, X8 1091 MOVQ X8, (DX) 1092 RET 1093 1094 aes129plus: 1095 // make 7 more starting seeds 1096 MOVO X1, X2 1097 MOVO X1, X3 1098 MOVO X1, X4 1099 MOVO X1, X5 1100 MOVO X1, X6 1101 MOVO X1, X7 1102 PXOR runtime·aeskeysched+16(SB), X1 1103 PXOR runtime·aeskeysched+32(SB), X2 1104 PXOR runtime·aeskeysched+48(SB), X3 1105 PXOR runtime·aeskeysched+64(SB), X4 1106 PXOR runtime·aeskeysched+80(SB), X5 1107 PXOR runtime·aeskeysched+96(SB), X6 1108 PXOR runtime·aeskeysched+112(SB), X7 1109 AESENC X1, X1 1110 AESENC X2, X2 1111 AESENC X3, X3 1112 AESENC X4, X4 1113 AESENC X5, X5 1114 AESENC X6, X6 1115 AESENC X7, X7 1116 1117 // start with last (possibly overlapping) block 1118 MOVOU -128(AX)(CX*1), X8 1119 MOVOU -112(AX)(CX*1), X9 1120 MOVOU -96(AX)(CX*1), X10 1121 MOVOU -80(AX)(CX*1), X11 1122 MOVOU -64(AX)(CX*1), X12 1123 MOVOU -48(AX)(CX*1), X13 1124 MOVOU -32(AX)(CX*1), X14 
1125 MOVOU -16(AX)(CX*1), X15 1126 1127 // xor in seed 1128 PXOR X0, X8 1129 PXOR X1, X9 1130 PXOR X2, X10 1131 PXOR X3, X11 1132 PXOR X4, X12 1133 PXOR X5, X13 1134 PXOR X6, X14 1135 PXOR X7, X15 1136 1137 // compute number of remaining 128-byte blocks 1138 DECQ CX 1139 SHRQ $7, CX 1140 1141 aesloop: 1142 // scramble state 1143 AESENC X8, X8 1144 AESENC X9, X9 1145 AESENC X10, X10 1146 AESENC X11, X11 1147 AESENC X12, X12 1148 AESENC X13, X13 1149 AESENC X14, X14 1150 AESENC X15, X15 1151 1152 // scramble state, xor in a block 1153 MOVOU (AX), X0 1154 MOVOU 16(AX), X1 1155 MOVOU 32(AX), X2 1156 MOVOU 48(AX), X3 1157 AESENC X0, X8 1158 AESENC X1, X9 1159 AESENC X2, X10 1160 AESENC X3, X11 1161 MOVOU 64(AX), X4 1162 MOVOU 80(AX), X5 1163 MOVOU 96(AX), X6 1164 MOVOU 112(AX), X7 1165 AESENC X4, X12 1166 AESENC X5, X13 1167 AESENC X6, X14 1168 AESENC X7, X15 1169 1170 ADDQ $128, AX 1171 DECQ CX 1172 JNE aesloop 1173 1174 // 3 more scrambles to finish 1175 AESENC X8, X8 1176 AESENC X9, X9 1177 AESENC X10, X10 1178 AESENC X11, X11 1179 AESENC X12, X12 1180 AESENC X13, X13 1181 AESENC X14, X14 1182 AESENC X15, X15 1183 AESENC X8, X8 1184 AESENC X9, X9 1185 AESENC X10, X10 1186 AESENC X11, X11 1187 AESENC X12, X12 1188 AESENC X13, X13 1189 AESENC X14, X14 1190 AESENC X15, X15 1191 AESENC X8, X8 1192 AESENC X9, X9 1193 AESENC X10, X10 1194 AESENC X11, X11 1195 AESENC X12, X12 1196 AESENC X13, X13 1197 AESENC X14, X14 1198 AESENC X15, X15 1199 1200 PXOR X12, X8 1201 PXOR X13, X9 1202 PXOR X14, X10 1203 PXOR X15, X11 1204 PXOR X10, X8 1205 PXOR X11, X9 1206 PXOR X9, X8 1207 MOVQ X8, (DX) 1208 RET 1209 1210 TEXT runtime·aeshash32(SB),NOSPLIT,$0-24 1211 MOVQ p+0(FP), AX // ptr to data 1212 MOVQ h+8(FP), X0 // seed 1213 PINSRD $2, (AX), X0 // data 1214 AESENC runtime·aeskeysched+0(SB), X0 1215 AESENC runtime·aeskeysched+16(SB), X0 1216 AESENC runtime·aeskeysched+32(SB), X0 1217 MOVQ X0, ret+16(FP) 1218 RET 1219 1220 TEXT runtime·aeshash64(SB),NOSPLIT,$0-24 1221 MOVQ p+0(FP), AX 
	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

// simple mask to get rid of data in the high part of the register.
// Entry i (each entry is 16 bytes, i = 0..15) has its low i bytes set to
// 0xff and the rest zero, so PAND with masks<>[i] keeps only the low
// i bytes of an XMM register.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256

// func checkASM() bool
// Reports whether the assembly data tables satisfy their alignment
// requirements (body follows below).
TEXT ·checkASM(SB),NOSPLIT,$0-1
	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
	// (OR the two addresses together: if any of the low 4 bits of either
	// is set, the combined value has a low bit set and the test fails).
	MOVQ	$masks<>(SB), AX
	MOVQ	$shifts<>(SB), BX
	ORQ	BX, AX
	TESTQ	$15, AX
	SETEQ	ret+0(FP)
	RET

// these are arguments to pshufb. They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
DATA shifts<>+0x00(SB)/8, $0x0000000000000000
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256

// memequal(p, q unsafe.Pointer, size uintptr) bool
// Fast path: identical pointers are trivially equal; otherwise tail-jump
// into memeqbody with its register-based calling convention.
TEXT runtime·memequal(SB),NOSPLIT,$0-25
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	size+16(FP), BX
	LEAQ	ret+24(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+24(FP)
	RET

// memequal_varlen(a, b unsafe.Pointer) bool
// Closure-called variant: the compare size lives in the closure, reached
// through the closure context register DX.
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	8(DX), BX	// compiler stores size at offset 8 in the closure
	LEAQ	ret+16(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+16(FP)
	RET

// eqstring tests whether two strings are equal.
// The compiler guarantees that strings passed
// to eqstring have equal length.
// See runtime_test.go:eqstring_generic for
// equivalent Go code.
TEXT runtime·eqstring(SB),NOSPLIT,$0-33
	MOVQ	s1_base+0(FP), SI
	MOVQ	s2_base+16(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	s1_len+8(FP), BX
	LEAQ	ret+32(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+32(FP)
	RET

// memeqbody: compare BX bytes at SI and DI, store the boolean result at AX.
// a in SI
// b in DI
// count in BX
// address of result byte in AX
TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small
	CMPQ	BX, $64
	JB	bigloop
	CMPB	runtime·support_avx2(SB), $1
	JE	hugeloop_avx2

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff	// all 16 mask bits set => all 64 bytes equal
	JEQ	hugeloop
	MOVB	$0, (AX)
	RET

// 64 bytes at a time using ymm registers
hugeloop_avx2:
	CMPQ	BX, $64
	JB	bigloop_avx2
	VMOVDQU	(SI), Y0
	VMOVDQU	(DI), Y1
	VMOVDQU	32(SI), Y2
	VMOVDQU	32(DI), Y3
	VPCMPEQB	Y1, Y0, Y4
	VPCMPEQB	Y2, Y3, Y5
	VPAND	Y4, Y5, Y6
	VPMOVMSKB	Y6, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffffffff	// all 32 mask bits set => all 64 bytes equal
	JEQ	hugeloop_avx2
	VZEROUPPER	// leave AVX state before returning to SSE-using code
	MOVB	$0, (AX)
	RET

bigloop_avx2:
	VZEROUPPER

// 8 bytes at a time using 64-bit register
bigloop:
	CMPQ	BX, $8
	JBE	leftover
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	bigloop
	MOVB	$0, (AX)
	RET

// remaining 0-8 bytes
// (loads the last 8 bytes, overlapping already-compared data; safe since
// this path is only reached with BX >= 8 at entry)
leftover:
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	(AX)
	RET

// fewer than 8 bytes total: load up to 8 bytes per side without reading
// past a page boundary, then compare only the low BX bytes.
small:
	CMPQ	BX, $0
	JEQ	equal

	LEAQ	0(BX*8), CX	// bytes left -> bits left
	NEGQ	CX	// CX = 64 - bits left (mod 64), used as a shift count

	CMPB	SI, $0xf8
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
di_finish:

	// Compare only the low BX bytes: subtract, then shift the difference
	// left so bytes beyond the length fall off; ZF from SHLQ feeds SETEQ.
	SUBQ	SI, DI
	SHLQ	CX, DI
equal:
	SETEQ	(AX)
	RET

// func cmpstring(s1, s2 string) int
TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
	MOVQ	s1_base+0(FP), SI
	MOVQ	s1_len+8(FP), BX
	MOVQ	s2_base+16(FP), DI
	MOVQ	s2_len+24(FP), DX
	LEAQ	ret+32(FP), R9
	JMP	runtime·cmpbody(SB)

// func Compare(a, b []byte) int — assembly body for package bytes.
TEXT bytes·Compare(SB),NOSPLIT,$0-56
	MOVQ	s1+0(FP), SI
	MOVQ	s1+8(FP), BX
	MOVQ	s2+24(FP), DI
	MOVQ	s2+32(FP), DX
	LEAQ	res+48(FP), R9
	JMP	runtime·cmpbody(SB)

// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8	// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	CMPB	runtime·support_avx2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// diff64/diff48/diff32 adjust SI/DI to the 16-byte chunk that held the
	// difference before falling into the common diff16 handler.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// Overlapping 8-byte load of the tail (safe: R8 >= 8 on this path).
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX	// - bits left (== 64 - bits left mod 64)
	JEQ	allsame

	// load bytes of a into high bytes of AX
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI

	// load bytes of b into high bytes of DI (same page-safe technique)
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

// The compared prefixes are identical: order by length.
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET

// this works for >= 64 bytes of data.
// Compare 64 bytes per iteration as four 16-byte SSE chunks; on a
// mismatch, jump to the diff handler for the offending chunk.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop

// Compare 64-bytes per loop iteration.
// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB	Y2, Y3, Y0
	VPMOVMSKB	Y0, AX
	XORL	$0xffffffff, AX
	JNE	diff32_avx2
	VPCMPEQB	Y4, Y5, Y6
	VPMOVMSKB	Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP	diff16

// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP	diff48

// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP	loop


// indexShortStr(s, c string) int — returns the index of the first
// occurrence of c in s, or -1. Dispatches on len(c) to a size-specialized
// brute-force loop; long haystacks with SSE4.2 use PCMPESTRI instead.
// TODO: Also use this in bytes.Index
TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), DI
	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
	MOVQ s_len+8(FP), DX
	MOVQ c+16(FP), BP
	MOVQ c_len+24(FP), AX
	CMPQ AX, DX
	JA fail
	CMPQ DX, $16
	JAE sse42
no_sse42:
	CMPQ AX, $2
	JA _3_or_more
	// len(sep) <= 2: compare one 16-bit load per position.
	MOVW (BP), BP
	LEAQ -1(DI)(DX*1), DX	// DX = one past the last valid start position
loop2:
	MOVW (DI), SI
	CMPW SI,BP
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop2
	JMP fail
_3_or_more:
	CMPQ AX, $3
	JA _4_or_more
	// len(sep) == 3: match the first 2 bytes, then confirm bytes 1-2.
	MOVW 1(BP), BX
	MOVW (BP), BP
	LEAQ -2(DI)(DX*1), DX
loop3:
	MOVW (DI), SI
	CMPW SI,BP
	JZ partial_success3
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
partial_success3:
	MOVW 1(DI), SI
	CMPW SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
_4_or_more:
	CMPQ AX, $4
	JA _5_or_more
	// len(sep) == 4: single 32-bit compare per position.
	MOVL (BP), BP
	LEAQ -3(DI)(DX*1), DX
loop4:
	MOVL (DI), SI
	CMPL SI,BP
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop4
	JMP fail
_5_or_more:
	CMPQ AX, $7
	JA _8_or_more
	// len(sep) 5-7: compare the first 4 bytes, then the (overlapping)
	// last 4 bytes of the separator.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVL -4(BP)(AX*1), BX
	MOVL (BP), BP
loop5to7:
	MOVL (DI), SI
	CMPL SI,BP
	JZ partial_success5to7
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
partial_success5to7:
	MOVL -4(AX)(DI*1), SI
	CMPL SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
_8_or_more:
	CMPQ AX, $8
	JA _9_or_more
	// len(sep) == 8: single 64-bit compare per position.
	MOVQ (BP), BP
	LEAQ -7(DI)(DX*1), DX
loop8:
	MOVQ (DI), SI
	CMPQ SI,BP
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop8
	JMP fail
_9_or_more:
	// NOTE(review): with this $16 guard, len(sep) == 16 is handled by
	// loop9to15 (two overlapping 8-byte compares cover all 16 bytes),
	// which makes the loop16 path under _16_or_more unreachable
	// (_16_or_more is only entered with AX > 16). Verify against upstream,
	// which later uses $15 here so loop16 handles the 16-byte case.
	CMPQ AX, $16
	JA _16_or_more
	// len(sep) 9-16: compare the first 8 bytes, then the (overlapping)
	// last 8 bytes of the separator.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVQ -8(BP)(AX*1), BX
	MOVQ (BP), BP
loop9to15:
	MOVQ (DI), SI
	CMPQ SI,BP
	JZ partial_success9to15
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
partial_success9to15:
	MOVQ -8(AX)(DI*1), SI
	CMPQ SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
_16_or_more:
	CMPQ AX, $16
	JA _17_to_31
	// len(sep) == 16: single 16-byte SSE compare per position.
	MOVOU (BP), X1
	LEAQ -15(DI)(DX*1), DX
loop16:
	MOVOU (DI), X2
	PCMPEQB X1, X2
	PMOVMSKB X2, SI
	CMPQ  SI, $0xffff
	JE   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop16
	JMP fail
_17_to_31:
	// len(sep) 17-31: compare the first 16 bytes, then the (overlapping)
	// last 16 bytes of the separator.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVOU -16(BP)(AX*1), X0
	MOVOU (BP), X1
loop17to31:
	MOVOU (DI), X2
	PCMPEQB X1,X2
	PMOVMSKB X2, SI
	CMPQ  SI, $0xffff
	JE   partial_success17to31
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
partial_success17to31:
	MOVOU -16(AX)(DI*1), X3
	PCMPEQB X0, X3
	PMOVMSKB X3, SI
	CMPQ  SI, $0xffff
	JE success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
fail:
	MOVQ $-1, ret+32(FP)
	RET
sse42:
	MOVL runtime·cpuid_ecx(SB), CX
	ANDL $0x100000, CX	// bit 20 of CPUID.1:ECX => SSE4.2 available
	JZ no_sse42
	CMPQ AX, $12
	// PCMPESTRI is slower than normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
	JAE _9_or_more
	// Make sure the 16-byte load of the separator won't cross a page
	// boundary before using the wide load.
	LEAQ 16(BP), SI
	TESTW $0xff0, SI
	JEQ no_sse42
	MOVOU (BP), X1
	LEAQ -15(DI)(DX*1), SI
	MOVQ $16, R9
	SUBQ AX, R9	// We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
	// for equality (bits 2,3 are 11)
	// result is not masked or inverted (bits 4,5 are 00)
	// and corresponds to first matching byte (bit 6 is 0)
	PCMPESTRI $0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means partial match at the end of the string,
	// otherwise sep is at offset CX from X1 start
	CMPQ CX, R9
	JBE sse42_success
	ADDQ R9, DI
	CMPQ DI, SI
	JB loop_sse42
	// Final (overlapping) window at the very end of the string.
	PCMPESTRI $0x0c, -1(SI), X1
	CMPQ CX, R9
	JA fail
	LEAQ -1(SI), DI
sse42_success:
	ADDQ CX, DI
success:
	SUBQ s+0(FP), DI	// convert pointer back to an index into s
	MOVQ DI, ret+32(FP)
	RET


// func IndexByte(s []byte, c byte) int — assembly body for package bytes.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+24(FP), AL
	LEAQ ret+32(FP), R8
	JMP  runtime·indexbytebody(SB)

// func IndexByte(s string, c byte) int — assembly body for package strings.
TEXT strings·IndexByte(SB),NOSPLIT,$0-32
	MOVQ s+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+16(FP), AL
	LEAQ ret+24(FP), R8
	JMP  runtime·indexbytebody(SB)

// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0

	CMPQ BX, $16
	JLT small

	MOVQ SI, DI

	CMPQ BX, $32
	JA avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Find first set bit, if any.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ	$-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ	SI, DI	// Compute offset of chunk within data.
	ADDQ	DX, DI	// Add offset of byte within chunk.
	MOVQ	DI, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure

	// Check if we'll load across a page boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1	// Load data
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX	// Move result bits to integer register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	CMPL	DX, BX
	JAE	failure	// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX	// Move result bits to integer register.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX	// Shift desired bits down to bottom of register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	CMPB	runtime·support_avx2(SB), $1
	JNE	sse
	MOVD	AX, X0
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1	// replicate target byte across all of Y1
avx2_loop:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3
	JNZ	avx2success
	ADDQ	$32, DI
	CMPQ	DI, R11
	JLT	avx2_loop
	// Search the final (possibly overlapping) 32-byte chunk.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3
	JNZ	avx2success
	VZEROUPPER
	MOVQ	$-1, (R8)
	RET

avx2success:
	VPMOVMSKB	Y3, DX
	BSFL	DX, DX	// index of match within the chunk
	SUBQ	SI, DI
	ADDQ	DI, DX
	MOVQ	DX, (R8)
	VZEROUPPER	// leave AVX state before returning to SSE-using code
	RET

// func Equal(a, b []byte) bool — assembly body for package bytes.
TEXT bytes·Equal(SB),NOSPLIT,$0-49
	MOVQ	a_len+8(FP), BX
	MOVQ	b_len+32(FP), CX
	CMPQ	BX, CX
	JNE	eqret
	MOVQ	a+0(FP), SI
	MOVQ	b+24(FP), DI
	LEAQ	ret+48(FP), AX
	JMP	runtime·memeqbody(SB)
eqret:
	MOVB	$0, ret+48(FP)
	RET

// func fastrand1() uint32
// Per-M xorshift-style generator: state lives in g.m.fastrand.
TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), AX
	MOVL	m_fastrand(AX), DX
	ADDL	DX, DX
	MOVL	DX, BX
	XORL	$0x88888eef, DX
	CMOVLMI	BX, DX	// only xor in the constant when the shifted-out bit was 0
	MOVL	DX, m_fastrand(AX)
	MOVL	DX, ret+0(FP)
	RET

// func return0() — used where a zero AX return value is needed.
TEXT runtime·return0(SB), NOSPLIT, $0
	MOVL	$0, AX
	RET


// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
// Must obey the gcc calling convention.
TEXT _cgo_topofstack(SB),NOSPLIT,$0
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), AX
	MOVQ	m_curg(AX), AX
	MOVQ	(g_stack+stack_hi)(AX), AX
	RET

// The top-most function running on a goroutine
// returns to goexit+PCQuantum.
TEXT runtime·goexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP
	CALL	runtime·goexit1(SB)	// does not return
	// traceback from goexit1 must hit code range of goexit
	BYTE	$0x90	// NOP

// Prefetch helpers: each simply issues the corresponding prefetch hint
// for the given address.
TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT0	(AX)
	RET

TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT1	(AX)
	RET

TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT2	(AX)
	RET

TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHNTA	(AX)
	RET

// This is called from .init_array and follows the platform, not Go, ABI.
// DI (first C argument) is the moduledata to append to the linked list.
TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
	PUSHQ	R15	// The access to global variables below implicitly uses R15, which is callee-save
	MOVQ	runtime·lastmoduledatap(SB), AX
	MOVQ	DI, moduledata_next(AX)
	MOVQ	DI, runtime·lastmoduledatap(SB)
	POPQ	R15
	RET