// Source: github.com/tidwall/go@v0.0.0-20170415222209-6694a6888b7d/src/runtime/asm_amd64.s

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "go_tls.h"
#include "funcdata.h"
#include "textflag.h"

// rt0_go is the runtime entry point: it sets up g0/m0, probes CPU
// features via CPUID, initializes TLS, runs scheduler init, and starts
// the main goroutine. It never returns.
// In:  DI = argc, SI = argv (as handed over by the OS-specific rt0).
TEXT runtime·rt0_go(SB),NOSPLIT,$0
	// copy arguments forward on an even stack
	MOVQ	DI, AX		// argc
	MOVQ	SI, BX		// argv
	SUBQ	$(4*8+7), SP		// 2args 2auto
	ANDQ	$~15, SP	// 16-byte align SP
	MOVQ	AX, 16(SP)
	MOVQ	BX, 24(SP)

	// create istack out of the given (operating system) stack.
	// _cgo_init may update stackguard.
	MOVQ	$runtime·g0(SB), DI
	LEAQ	(-64*1024+104)(SP), BX	// assume 64KB of usable OS stack
	MOVQ	BX, g_stackguard0(DI)
	MOVQ	BX, g_stackguard1(DI)
	MOVQ	BX, (g_stack+stack_lo)(DI)
	MOVQ	SP, (g_stack+stack_hi)(DI)

	// find out information about the processor we're on
	MOVQ	$0, AX
	CPUID
	MOVQ	AX, SI		// SI = max basic CPUID leaf
	CMPQ	AX, $0
	JE	nocpuinfo

	// Figure out how to serialize RDTSC.
	// On Intel processors LFENCE is enough. AMD requires MFENCE.
	// Don't know about the rest, so let's do MFENCE.
	CMPL	BX, $0x756E6547  // "Genu"
	JNE	notintel
	CMPL	DX, $0x49656E69  // "ineI"
	JNE	notintel
	CMPL	CX, $0x6C65746E  // "ntel"
	JNE	notintel
	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
notintel:

	// Load EAX=1 cpuid flags
	MOVQ	$1, AX
	CPUID
	MOVL	CX, runtime·cpuid_ecx(SB)
	MOVL	DX, runtime·cpuid_edx(SB)

	// Load EAX=7/ECX=0 cpuid flags
	CMPQ	SI, $7
	JLT	no7		// leaf 7 not supported on this CPU
	MOVL	$7, AX
	MOVL	$0, CX
	CPUID
	MOVL	BX, runtime·cpuid_ebx7(SB)
no7:
	// Detect AVX and AVX2 as per 14.7.1 Detection of AVX2 chapter of [1]
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
	MOVL	runtime·cpuid_ecx(SB), CX
	ANDL	$0x18000000, CX // check for OSXSAVE and AVX bits
	CMPL	CX, $0x18000000
	JNE	noavx
	MOVL	$0, CX
	// For XGETBV, OSXSAVE bit is required and sufficient
	XGETBV
	ANDL	$6, AX
	CMPL	AX, $6 // Check for OS support of YMM registers
	JNE	noavx
	MOVB	$1, runtime·support_avx(SB)
	TESTL	$(1<<5), runtime·cpuid_ebx7(SB) // check for AVX2 bit
	JEQ	noavx2
	MOVB	$1, runtime·support_avx2(SB)
	JMP	testbmi1
noavx:
	MOVB	$0, runtime·support_avx(SB)
noavx2:
	MOVB	$0, runtime·support_avx2(SB)
testbmi1:
	// Detect BMI1 and BMI2 extensions as per
	// 5.1.16.1 Detection of VEX-encoded GPR Instructions,
	// LZCNT and TZCNT, PREFETCHW chapter of [1]
	MOVB	$0, runtime·support_bmi1(SB)
	TESTL	$(1<<3), runtime·cpuid_ebx7(SB) // check for BMI1 bit
	JEQ	testbmi2
	MOVB	$1, runtime·support_bmi1(SB)
testbmi2:
	MOVB	$0, runtime·support_bmi2(SB)
	TESTL	$(1<<8), runtime·cpuid_ebx7(SB) // check for BMI2 bit
	JEQ	testpopcnt
	MOVB	$1, runtime·support_bmi2(SB)
testpopcnt:
	MOVB	$0, runtime·support_popcnt(SB)
	TESTL	$(1<<23), runtime·cpuid_ecx(SB) // check for POPCNT bit
	JEQ	nocpuinfo
	MOVB	$1, runtime·support_popcnt(SB)
nocpuinfo:

	// if there is an _cgo_init, call it.
	MOVQ	_cgo_init(SB), AX
	TESTQ	AX, AX
	JZ	needtls
	// g0 already in DI
	MOVQ	DI, CX	// Win64 uses CX for first parameter
	MOVQ	$setg_gcc<>(SB), SI
	CALL	AX

	// update stackguard after _cgo_init
	MOVQ	$runtime·g0(SB), CX
	MOVQ	(g_stack+stack_lo)(CX), AX
	ADDQ	$const__StackGuard, AX
	MOVQ	AX, g_stackguard0(CX)
	MOVQ	AX, g_stackguard1(CX)

#ifndef GOOS_windows
	JMP ok
#endif
needtls:
#ifdef GOOS_plan9
	// skip TLS setup on Plan 9
	JMP ok
#endif
#ifdef GOOS_solaris
	// skip TLS setup on Solaris
	JMP ok
#endif

	LEAQ	runtime·m0+m_tls(SB), DI
	CALL	runtime·settls(SB)

	// store through it, to make sure it works
	get_tls(BX)
	MOVQ	$0x123, g(BX)
	MOVQ	runtime·m0+m_tls(SB), AX
	CMPQ	AX, $0x123
	JEQ 2(PC)
	MOVL	AX, 0	// abort: deliberate fault if TLS round-trip failed
ok:
	// set the per-goroutine and per-mach "registers"
	get_tls(BX)
	LEAQ	runtime·g0(SB), CX
	MOVQ	CX, g(BX)
	LEAQ	runtime·m0(SB), AX

	// save m->g0 = g0
	MOVQ	CX, m_g0(AX)
	// save m0 to g0->m
	MOVQ	AX, g_m(CX)

	CLD				// convention is D is always left cleared
	CALL	runtime·check(SB)

	MOVL	16(SP), AX		// copy argc
	MOVL	AX, 0(SP)
	MOVQ	24(SP), AX		// copy argv
	MOVQ	AX, 8(SP)
	CALL	runtime·args(SB)
	CALL	runtime·osinit(SB)
	CALL	runtime·schedinit(SB)

	// create a new goroutine to start program
	MOVQ	$runtime·mainPC(SB), AX		// entry
	PUSHQ	AX
	PUSHQ	$0			// arg size
	CALL	runtime·newproc(SB)
	POPQ	AX
	POPQ	AX

	// start this M
	CALL	runtime·mstart(SB)

	MOVL	$0xf1, 0xf1  // crash: mstart should never return
	RET

// mainPC is a function value for runtime.main, placed in RODATA so
// newproc above can take its address as the goroutine entry point.
DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
GLOBL	runtime·mainPC(SB),RODATA,$8

// breakpoint executes an INT3 (0xCC) software breakpoint.
TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
	BYTE	$0xcc
	RET

TEXT runtime·asminit(SB),NOSPLIT,$0-0
	// No per-thread init.
	RET

/*
 *  go-routine
 */

// void gosave(Gobuf*)
// save state in Gobuf; setjmp
TEXT runtime·gosave(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), AX		// gobuf
	LEAQ	buf+0(FP), BX		// caller's SP
	MOVQ	BX, gobuf_sp(AX)
	MOVQ	0(SP), BX		// caller's PC
	MOVQ	BX, gobuf_pc(AX)
	MOVQ	$0, gobuf_ret(AX)
	MOVQ	BP, gobuf_bp(AX)
	// Assert ctxt is zero. See func save.
	MOVQ	gobuf_ctxt(AX), BX
	TESTQ	BX, BX
	JZ	2(PC)
	CALL	runtime·badctxt(SB)
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	BX, gobuf_g(AX)
	RET

// void gogo(Gobuf*)
// restore state from Gobuf; longjmp
TEXT runtime·gogo(SB), NOSPLIT, $16-8
	MOVQ	buf+0(FP), BX		// gobuf

	// If ctxt is not nil, invoke deletion barrier before overwriting.
	MOVQ	gobuf_ctxt(BX), AX
	TESTQ	AX, AX
	JZ	nilctxt
	LEAQ	gobuf_ctxt(BX), AX
	MOVQ	AX, 0(SP)
	MOVQ	$0, 8(SP)
	CALL	runtime·writebarrierptr_prewrite(SB)
	MOVQ	buf+0(FP), BX	// reload; BX may be clobbered by the call

nilctxt:
	MOVQ	gobuf_g(BX), DX
	MOVQ	0(DX), CX		// make sure g != nil
	get_tls(CX)
	MOVQ	DX, g(CX)
	MOVQ	gobuf_sp(BX), SP	// restore SP
	MOVQ	gobuf_ret(BX), AX
	MOVQ	gobuf_ctxt(BX), DX
	MOVQ	gobuf_bp(BX), BP
	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
	MOVQ	$0, gobuf_ret(BX)
	MOVQ	$0, gobuf_ctxt(BX)
	MOVQ	$0, gobuf_bp(BX)
	MOVQ	gobuf_pc(BX), BX
	JMP	BX

// func mcall(fn func(*g))
// Switch to m->g0's stack, call fn(g).
// Fn must never return. It should gogo(&g->sched)
// to keep running g.
TEXT runtime·mcall(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI

	get_tls(CX)
	MOVQ	g(CX), AX	// save state in g->sched
	MOVQ	0(SP), BX	// caller's PC
	MOVQ	BX, (g_sched+gobuf_pc)(AX)
	LEAQ	fn+0(FP), BX	// caller's SP
	MOVQ	BX, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to m->g0 & its stack, call fn
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	SI, AX	// if g == m->g0 call badmcall
	JNE	3(PC)
	MOVQ	$runtime·badmcall(SB), AX
	JMP	AX
	MOVQ	SI, g(CX)	// g = m->g0
	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	PUSHQ	AX	// fn's argument: the old g
	MOVQ	DI, DX
	MOVQ	0(DI), DI	// load code pointer out of the func value
	CALL	DI
	POPQ	AX
	// fn must not return; reaching here is a runtime bug.
	MOVQ	$runtime·badmcall2(SB), AX
	JMP	AX
	RET

// systemstack_switch is a dummy routine that systemstack leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the system stack because the one at the top of
// the system stack terminates the stack walk (see topofstack()).
TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
	RET

// func systemstack(fn func())
TEXT runtime·systemstack(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI	// DI = fn
	get_tls(CX)
	MOVQ	g(CX), AX	// AX = g
	MOVQ	g_m(AX), BX	// BX = m

	MOVQ	m_gsignal(BX), DX	// DX = gsignal
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_g0(BX), DX	// DX = g0
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_curg(BX), R8
	CMPQ	AX, R8
	JEQ	switch

	// Bad: g is not gsignal, not g0, not curg. What is it?
	MOVQ	$runtime·badsystemstack(SB), AX
	CALL	AX

switch:
	// save our state in g->sched. Pretend to
	// be systemstack_switch if the G stack is scanned.
	MOVQ	$runtime·systemstack_switch(SB), SI
	MOVQ	SI, (g_sched+gobuf_pc)(AX)
	MOVQ	SP, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to g0
	MOVQ	DX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(DX), BX
	// make it look like mstart called systemstack on g0, to stop traceback
	SUBQ	$8, BX
	MOVQ	$runtime·mstart(SB), DX
	MOVQ	DX, 0(BX)
	MOVQ	BX, SP

	// call target function
	MOVQ	DI, DX
	MOVQ	0(DI), DI	// code pointer from the func value
	CALL	DI

	// switch back to g
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), BX
	MOVQ	m_curg(BX), AX
	MOVQ	AX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(AX), SP
	MOVQ	$0, (g_sched+gobuf_sp)(AX)
	RET

noswitch:
	// already on m stack, just call directly
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI
	RET

/*
 * support for morestack
 */

// Called during function prolog when more stack is needed.
//
// The traceback routines see morestack on a g0 as being
// the top of a stack (for example, morestack calling newstack
// calling the scheduler calling newm calling gc), so we must
// record an argument size. For that purpose, it has no arguments.
TEXT runtime·morestack(SB),NOSPLIT,$0-0
	// Cannot grow scheduler stack (m->g0).
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	g(CX), SI
	JNE	3(PC)
	CALL	runtime·badmorestackg0(SB)
	INT	$3

	// Cannot grow signal stack (m->gsignal).
	MOVQ	m_gsignal(BX), SI
	CMPQ	g(CX), SI
	JNE	3(PC)
	CALL	runtime·badmorestackgsignal(SB)
	INT	$3

	// Called from f.
	// Set m->morebuf to f's caller.
	MOVQ	8(SP), AX	// f's caller's PC
	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
	LEAQ	16(SP), AX	// f's caller's SP
	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SI, (m_morebuf+gobuf_g)(BX)

	// Set g->sched to context in f.
	MOVQ	0(SP), AX // f's PC
	MOVQ	AX, (g_sched+gobuf_pc)(SI)
	MOVQ	SI, (g_sched+gobuf_g)(SI)
	LEAQ	8(SP), AX // f's SP
	MOVQ	AX, (g_sched+gobuf_sp)(SI)
	MOVQ	BP, (g_sched+gobuf_bp)(SI)
	// newstack will fill gobuf.ctxt.

	// Call newstack on m->g0's stack.
	MOVQ	m_g0(BX), BX
	MOVQ	BX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(BX), SP
	PUSHQ	DX	// ctxt argument
	CALL	runtime·newstack(SB)
	MOVQ	$0, 0x1003	// crash if newstack returns
	POPQ	DX	// keep balance check happy
	RET

// morestack but not preserving ctxt.
TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
	MOVL	$0, DX
	JMP	runtime·morestack(SB)

// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
// of constant-sized-frame functions to encode a few bits of size in the pc.
// Caution: ugly multiline assembly macros in your future!

// DISPATCH jumps to NAME if the requested argsize (in CX) fits MAXSIZE.
#define DISPATCH(NAME,MAXSIZE)		\
	CMPQ	CX, $MAXSIZE;		\
	JA	3(PC);			\
	MOVQ	$NAME(SB), AX;		\
	JMP	AX
// Note: can't just "JMP NAME(SB)" - bad inlining results.
TEXT reflect·call(SB), NOSPLIT, $0-0
	JMP	·reflectcall(SB)

// reflectcall dispatches to the smallest fixed-frame call* routine
// whose MAXSIZE can hold argsize bytes of arguments.
TEXT ·reflectcall(SB), NOSPLIT, $0-32
	MOVLQZX argsize+24(FP), CX
	DISPATCH(runtime·call32, 32)
	DISPATCH(runtime·call64, 64)
	DISPATCH(runtime·call128, 128)
	DISPATCH(runtime·call256, 256)
	DISPATCH(runtime·call512, 512)
	DISPATCH(runtime·call1024, 1024)
	DISPATCH(runtime·call2048, 2048)
	DISPATCH(runtime·call4096, 4096)
	DISPATCH(runtime·call8192, 8192)
	DISPATCH(runtime·call16384, 16384)
	DISPATCH(runtime·call32768, 32768)
	DISPATCH(runtime·call65536, 65536)
	DISPATCH(runtime·call131072, 131072)
	DISPATCH(runtime·call262144, 262144)
	DISPATCH(runtime·call524288, 524288)
	DISPATCH(runtime·call1048576, 1048576)
	DISPATCH(runtime·call2097152, 2097152)
	DISPATCH(runtime·call4194304, 4194304)
	DISPATCH(runtime·call8388608, 8388608)
	DISPATCH(runtime·call16777216, 16777216)
	DISPATCH(runtime·call33554432, 33554432)
	DISPATCH(runtime·call67108864, 67108864)
	DISPATCH(runtime·call134217728, 134217728)
	DISPATCH(runtime·call268435456, 268435456)
	DISPATCH(runtime·call536870912, 536870912)
	DISPATCH(runtime·call1073741824, 1073741824)
	MOVQ	$runtime·badreflectcall(SB), AX
	JMP	AX

// CALLFN defines one fixed-frame reflectcall trampoline: copy the
// arguments into its frame, call f, then copy results back via callRet.
#define CALLFN(NAME,MAXSIZE)			\
TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
	NO_LOCAL_POINTERS;			\
	/* copy arguments to stack */		\
	MOVQ	argptr+16(FP), SI;		\
	MOVLQZX argsize+24(FP), CX;		\
	MOVQ	SP, DI;				\
	REP;MOVSB;				\
	/* call function */			\
	MOVQ	f+8(FP), DX;			\
	PCDATA  $PCDATA_StackMapIndex, $0;	\
	CALL	(DX);				\
	/* copy return values back */		\
	MOVQ	argtype+0(FP), DX;		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX	argsize+24(FP), CX;		\
	MOVLQZX retoffset+28(FP), BX;		\
	MOVQ	SP, SI;				\
	ADDQ	BX, DI;				\
	ADDQ	BX, SI;				\
	SUBQ	BX, CX;				\
	CALL	callRet<>(SB);			\
	RET

// callRet copies return values back at the end of call*. This is a
// separate function so it can allocate stack space for the arguments
// to reflectcallmove. It does not follow the Go ABI; it expects its
// arguments in registers.
TEXT callRet<>(SB), NOSPLIT, $32-0
	NO_LOCAL_POINTERS
	MOVQ	DX, 0(SP)	// argtype
	MOVQ	DI, 8(SP)	// dst
	MOVQ	SI, 16(SP)	// src
	MOVQ	CX, 24(SP)	// size
	CALL	runtime·reflectcallmove(SB)
	RET

CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
CALLFN(·call256, 256)
CALLFN(·call512, 512)
CALLFN(·call1024, 1024)
CALLFN(·call2048, 2048)
CALLFN(·call4096, 4096)
CALLFN(·call8192, 8192)
CALLFN(·call16384, 16384)
CALLFN(·call32768, 32768)
CALLFN(·call65536, 65536)
CALLFN(·call131072, 131072)
CALLFN(·call262144, 262144)
CALLFN(·call524288, 524288)
CALLFN(·call1048576, 1048576)
CALLFN(·call2097152, 2097152)
CALLFN(·call4194304, 4194304)
CALLFN(·call8388608, 8388608)
CALLFN(·call16777216, 16777216)
CALLFN(·call33554432, 33554432)
CALLFN(·call67108864, 67108864)
CALLFN(·call134217728, 134217728)
CALLFN(·call268435456, 268435456)
CALLFN(·call536870912, 536870912)
CALLFN(·call1073741824, 1073741824)

// procyield spins for the given cycle count, issuing PAUSE each
// iteration to be polite to the other hyperthread.
TEXT runtime·procyield(SB),NOSPLIT,$0-0
	MOVL	cycles+0(FP), AX
again:
	PAUSE
	SUBL	$1, AX
	JNZ	again
	RET

TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
	// Stores are already ordered on x86, so this is just a
	// compile barrier.
	RET

// void jmpdefer(fn, sp);
// called from deferreturn.
// 1. pop the caller
// 2. sub 5 bytes from the callers return
// 3. jmp to the argument
TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
	MOVQ	fv+0(FP), DX	// fn
	MOVQ	argp+8(FP), BX	// caller sp
	LEAQ	-8(BX), SP	// caller sp after CALL
	MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
	SUBQ	$5, (SP)	// return to CALL again (5 = length of the CALL instruction)
	MOVQ	0(DX), BX
	JMP	BX	// but first run the deferred function

// Save state of caller into g->sched. Smashes R8, R9.
TEXT gosave<>(SB),NOSPLIT,$0
	get_tls(R8)
	MOVQ	g(R8), R8
	MOVQ	0(SP), R9	// caller's PC
	MOVQ	R9, (g_sched+gobuf_pc)(R8)
	LEAQ	8(SP), R9	// caller's SP
	MOVQ	R9, (g_sched+gobuf_sp)(R8)
	MOVQ	$0, (g_sched+gobuf_ret)(R8)
	MOVQ	BP, (g_sched+gobuf_bp)(R8)
	// Assert ctxt is zero. See func save.
	MOVQ	(g_sched+gobuf_ctxt)(R8), R9
	TESTQ	R9, R9
	JZ	2(PC)
	CALL	runtime·badctxt(SB)
	RET

// func asmcgocall(fn, arg unsafe.Pointer) int32
// Call fn(arg) on the scheduler stack,
// aligned appropriately for the gcc ABI.
// See cgocall.go for more details.
TEXT ·asmcgocall(SB),NOSPLIT,$0-20
	MOVQ	fn+0(FP), AX
	MOVQ	arg+8(FP), BX

	MOVQ	SP, DX	// remember the Go stack pointer

	// Figure out if we need to switch to m->g0 stack.
	// We get called to create new OS threads too, and those
	// come in on the m->g0 stack already.
	get_tls(CX)
	MOVQ	g(CX), R8
	CMPQ	R8, $0
	JEQ	nosave
	MOVQ	g_m(R8), R8
	MOVQ	m_g0(R8), SI
	MOVQ	g(CX), DI
	CMPQ	SI, DI
	JEQ	nosave
	MOVQ	m_gsignal(R8), SI
	CMPQ	SI, DI
	JEQ	nosave

	// Switch to system stack.
	MOVQ	m_g0(R8), SI
	CALL	gosave<>(SB)
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP

	// Now on a scheduling stack (a pthread-created stack).
	// Make sure we have enough room for 4 stack-backed fast-call
	// registers as per windows amd64 calling convention.
	SUBQ	$64, SP
	ANDQ	$~15, SP	// alignment for gcc ABI
	MOVQ	DI, 48(SP)	// save g
	MOVQ	(g_stack+stack_hi)(DI), DI
	SUBQ	DX, DI
	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
	MOVQ	BX, CX		// CX = first argument in Win64
	CALL	AX

	// Restore registers, g, stack pointer.
	get_tls(CX)
	MOVQ	48(SP), DI
	MOVQ	(g_stack+stack_hi)(DI), SI
	SUBQ	40(SP), SI	// recompute Go SP from saved depth
	MOVQ	DI, g(CX)
	MOVQ	SI, SP

	MOVL	AX, ret+16(FP)
	RET

nosave:
	// Running on a system stack, perhaps even without a g.
	// Having no g can happen during thread creation or thread teardown
	// (see needm/dropm on Solaris, for example).
	// This code is like the above sequence but without saving/restoring g
	// and without worrying about the stack moving out from under us
	// (because we're on a system stack, not a goroutine stack).
	// The above code could be used directly if already on a system stack,
	// but then the only path through this code would be a rare case on Solaris.
	// Using this code for all "already on system stack" calls exercises it more,
	// which should help keep it correct.
	SUBQ	$64, SP
	ANDQ	$~15, SP
	MOVQ	$0, 48(SP)	// where above code stores g, in case someone looks during debugging
	MOVQ	DX, 40(SP)	// save original stack pointer
	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
	MOVQ	BX, CX		// CX = first argument in Win64
	CALL	AX
	MOVQ	40(SP), SI	// restore original stack pointer
	MOVQ	SI, SP
	MOVL	AX, ret+16(FP)
	RET

// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
// Turn the fn into a Go func (by taking its address) and call
// cgocallback_gofunc.
TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
	LEAQ	fn+0(FP), AX	// &fn acts as a FuncVal* for the Go call below
	MOVQ	AX, 0(SP)
	MOVQ	frame+8(FP), AX
	MOVQ	AX, 8(SP)
	MOVQ	framesize+16(FP), AX
	MOVQ	AX, 16(SP)
	MOVQ	ctxt+24(FP), AX
	MOVQ	AX, 24(SP)
	MOVQ	$runtime·cgocallback_gofunc(SB), AX
	CALL	AX
	RET

// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
// See cgocall.go for more details.
TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
	NO_LOCAL_POINTERS

	// If g is nil, Go did not create the current thread.
	// Call needm to obtain one m for temporary use.
	// In this case, we're running on the thread stack, so there's
	// lots of space, but the linker doesn't know. Hide the call from
	// the linker analysis by using an indirect call through AX.
	get_tls(CX)
#ifdef GOOS_windows
	MOVL	$0, BX
	CMPQ	CX, $0
	JEQ	2(PC)
#endif
	MOVQ	g(CX), BX
	CMPQ	BX, $0
	JEQ	needm
	MOVQ	g_m(BX), BX
	MOVQ	BX, R8 // holds oldm until end of function
	JMP	havem
needm:
	MOVQ	$0, 0(SP)
	MOVQ	$runtime·needm(SB), AX
	CALL	AX
	MOVQ	0(SP), R8
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX

	// Set m->sched.sp = SP, so that if a panic happens
	// during the function we are about to execute, it will
	// have a valid SP to run on the g0 stack.
	// The next few lines (after the havem label)
	// will save this SP onto the stack and then write
	// the same SP back to m->sched.sp. That seems redundant,
	// but if an unrecovered panic happens, unwindm will
	// restore the g->sched.sp from the stack location
	// and then systemstack will try to use it. If we don't set it here,
	// that restored SP will be uninitialized (typically 0) and
	// will not be usable.
	MOVQ	m_g0(BX), SI
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

havem:
	// Now there's a valid m, and we're running on its m->g0.
	// Save current m->g0->sched.sp on stack and then set it to SP.
	// Save current sp in m->g0->sched.sp in preparation for
	// switch back to m->curg stack.
	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
	MOVQ	m_g0(BX), SI
	MOVQ	(g_sched+gobuf_sp)(SI), AX
	MOVQ	AX, 0(SP)
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

	// Switch to m->curg stack and call runtime.cgocallbackg.
	// Because we are taking over the execution of m->curg
	// but *not* resuming what had been running, we need to
	// save that information (m->curg->sched) so we can restore it.
	// We can restore m->curg->sched.sp easily, because calling
	// runtime.cgocallbackg leaves SP unchanged upon return.
	// To save m->curg->sched.pc, we push it onto the stack.
	// This has the added benefit that it looks to the traceback
	// routine like cgocallbackg is going to return to that
	// PC (because the frame we allocate below has the same
	// size as cgocallback_gofunc's frame declared above)
	// so that the traceback will seamlessly trace back into
	// the earlier calls.
	//
	// In the new goroutine, 8(SP) holds the saved R8.
	MOVQ	m_curg(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
	MOVQ	(g_sched+gobuf_pc)(SI), BX
	MOVQ	BX, -8(DI)
	// Compute the size of the frame, including return PC and, if
	// GOEXPERIMENT=framepointer, the saved base pointer
	MOVQ	ctxt+24(FP), BX
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX
	SUBQ	AX, DI
	MOVQ	DI, SP

	MOVQ	R8, 8(SP)
	MOVQ	BX, 0(SP)
	CALL	runtime·cgocallbackg(SB)
	MOVQ	8(SP), R8

	// Compute the size of the frame again. FP and SP have
	// completely different values here than they did above,
	// but only their difference matters.
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX

	// Restore g->sched (== m->curg->sched) from saved values.
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SP, DI
	ADDQ	AX, DI
	MOVQ	-8(DI), BX
	MOVQ	BX, (g_sched+gobuf_pc)(SI)
	MOVQ	DI, (g_sched+gobuf_sp)(SI)

	// Switch back to m->g0's stack and restore m->g0->sched.sp.
	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
	// so we do not have to restore it.)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP
	MOVQ	0(SP), AX
	MOVQ	AX, (g_sched+gobuf_sp)(SI)

	// If the m on entry was nil, we called needm above to borrow an m
	// for the duration of the call. Since the call is over, return it with dropm.
	CMPQ	R8, $0
	JNE 3(PC)
	MOVQ	$runtime·dropm(SB), AX
	CALL	AX

	// Done!
	RET

// void setg(G*); set g. for use by needm.
TEXT runtime·setg(SB), NOSPLIT, $0-8
	MOVQ	gg+0(FP), BX
#ifdef GOOS_windows
	CMPQ	BX, $0
	JNE	settls
	MOVQ	$0, 0x28(GS)	// clear the TLS slot used for g on Windows
	RET
settls:
	MOVQ	g_m(BX), AX
	LEAQ	m_tls(AX), AX
	MOVQ	AX, 0x28(GS)
#endif
	get_tls(CX)
	MOVQ	BX, g(CX)
	RET

// void setg_gcc(G*); set g called from gcc.
TEXT setg_gcc<>(SB),NOSPLIT,$0
	get_tls(AX)
	MOVQ	DI, g(AX)	// g passed in DI per the C (SysV) calling convention
	RET

// check that SP is in range [g->stack.lo, g->stack.hi)
TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
	get_tls(CX)
	MOVQ	g(CX), AX
	CMPQ	(g_stack+stack_hi)(AX), SP
	JHI	2(PC)
	INT	$3
	CMPQ	SP, (g_stack+stack_lo)(AX)
	JHI	2(PC)
	INT	$3
	RET

TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
	MOVQ	argp+0(FP),AX		// addr of first arg
	MOVQ	-8(AX),AX		// get calling pc
	MOVQ	AX, ret+8(FP)
	RET

// func cputicks() int64
// Reads the timestamp counter, serialized with LFENCE on Intel
// (see rt0_go's lfenceBeforeRdtsc detection) or MFENCE elsewhere.
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
	JNE	mfence
	LFENCE
	JMP	done
mfence:
	MFENCE
done:
	RDTSC
	SHLQ	$32, DX
	ADDQ	DX, AX		// combine EDX:EAX into a 64-bit tick count
	MOVQ	AX, ret+0(FP)
	RET

// memhash_varlen(p unsafe.Pointer, h seed) uintptr
// redirects to memhash(p, h, size) using the size
// stored in the closure.
TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
	GO_ARGS
	NO_LOCAL_POINTERS
	MOVQ	p+0(FP), AX
	MOVQ	h+8(FP), BX
	MOVQ	8(DX), CX	// size from the closure (DX = closure context register)
	MOVQ	AX, 0(SP)
	MOVQ	BX, 8(SP)
	MOVQ	CX, 16(SP)
	CALL	runtime·memhash(SB)
	MOVQ	24(SP), AX
	MOVQ	AX, ret+16(FP)
	RET

// hash function using AES hardware instructions
TEXT runtime·aeshash(SB),NOSPLIT,$0-32
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	s+16(FP), CX	// size
	LEAQ	ret+24(FP), DX
	JMP	runtime·aeshashbody(SB)

TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to string struct
	MOVQ	8(AX), CX	// length of string
	MOVQ	(AX), AX	// string data
	LEAQ	ret+16(FP), DX
	JMP	runtime·aeshashbody(SB)

// AX: data
// CX: length
// DX: address to put return value
TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
	// Fill an SSE register with our seeds.
	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
	PINSRW	$4, CX, X0			// 16 bits of length
	PSHUFHW $0, X0, X0			// repeat length 4 times total
	MOVO	X0, X1				// save unscrambled seed
	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
	AESENC	X0, X0				// scramble seed

	CMPQ	CX, $16
	JB	aes0to15
	JE	aes16
	CMPQ	CX, $32
	JBE	aes17to32
	CMPQ	CX, $64
	JBE	aes33to64
	CMPQ	CX, $128
	JBE	aes65to128
	JMP	aes129plus

aes0to15:
	TESTQ	CX, CX
	JE	aes0

	ADDQ	$16, AX
	TESTW	$0xff0, AX
	JE	endofpage

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X1
	ADDQ	CX, CX
	MOVQ	$masks<>(SB), AX
	PAND	(AX)(CX*8), X1	// mask off the bytes beyond the data
final1:
	PXOR	X0, X1	// xor data with seed
	AESENC	X1, X1	// scramble combo 3 times
	AESENC	X1, X1
	AESENC	X1, X1
	MOVQ	X1, (DX)
	RET

endofpage:
	// address ends in 1111xxxx. Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	MOVOU	-32(AX)(CX*1), X1
	ADDQ	CX, CX
	MOVQ	$shifts<>(SB), AX
	PSHUFB	(AX)(CX*8), X1
	JMP	final1

aes0:
	// Return scrambled input seed
	AESENC	X0, X0
	MOVQ	X0, (DX)
	RET

aes16:
	MOVOU	(AX), X1
	JMP	final1

aes17to32:
	// make second starting seed
	PXOR	runtime·aeskeysched+16(SB), X1
	AESENC	X1, X1

	// load data to be hashed
	MOVOU	(AX), X2
	MOVOU	-16(AX)(CX*1), X3	// overlapping load of the tail

	// xor with seed
	PXOR	X0, X2
	PXOR	X1, X3

	// scramble 3 times
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3

	// combine results
	PXOR	X3, X2
	MOVQ	X2, (DX)
	RET

aes33to64:
	// make 3 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3

	MOVOU	(AX), X4
	MOVOU	16(AX), X5
	MOVOU	-32(AX)(CX*1), X6
	MOVOU	-16(AX)(CX*1), X7

	PXOR	X0, X4
	PXOR	X1, X5
	PXOR	X2, X6
	PXOR	X3, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	PXOR	X6, X4
	PXOR	X7, X5
	PXOR	X5, X4
	MOVQ	X4, (DX)
	RET

aes65to128:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// load data
	MOVOU	(AX), X8
	MOVOU	16(AX), X9
	MOVOU	32(AX), X10
	MOVOU	48(AX), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor with seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// scramble 3 times
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

aes129plus:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// start with last (possibly overlapping) block
	MOVOU	-128(AX)(CX*1), X8
	MOVOU	-112(AX)(CX*1), X9
	MOVOU	-96(AX)(CX*1), X10
	MOVOU	-80(AX)(CX*1), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor in seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// compute number of remaining 128-byte blocks
	DECQ	CX
	SHRQ	$7, CX

aesloop:
	// scramble state
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// scramble state, xor in a block
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	MOVOU	64(AX), X4
	MOVOU	80(AX), X5
	MOVOU	96(AX), X6
	MOVOU	112(AX), X7
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	ADDQ	$128, AX
	DECQ	CX
	JNE	aesloop

	// 3 more scrambles to finish
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

// simple mask to get rid of data in the high part of the register.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256

// NOTE(review): checkASM's body (and the shifts<> table used by
// endofpage above) continue beyond this excerpt of the file.
TEXT ·checkASM(SB),NOSPLIT,$0-1
// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte 1264 MOVQ $masks<>(SB), AX 1265 MOVQ $shifts<>(SB), BX 1266 ORQ BX, AX 1267 TESTQ $15, AX 1268 SETEQ ret+0(FP) 1269 RET 1270 1271 // these are arguments to pshufb. They move data down from 1272 // the high bytes of the register to the low bytes of the register. 1273 // index is how many bytes to move. 1274 DATA shifts<>+0x00(SB)/8, $0x0000000000000000 1275 DATA shifts<>+0x08(SB)/8, $0x0000000000000000 1276 DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f 1277 DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff 1278 DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e 1279 DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff 1280 DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d 1281 DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff 1282 DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c 1283 DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff 1284 DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b 1285 DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff 1286 DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a 1287 DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff 1288 DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09 1289 DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff 1290 DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908 1291 DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff 1292 DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807 1293 DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f 1294 DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706 1295 DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e 1296 DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605 1297 DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d 1298 DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504 1299 DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c 1300 DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403 1301 DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b 1302 DATA shifts<>+0xe0(SB)/8, $0x0908070605040302 1303 DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a 1304 DATA shifts<>+0xf0(SB)/8, $0x0807060504030201 1305 DATA shifts<>+0xf8(SB)/8, 
$0xff0f0e0d0c0b0a09 1306 GLOBL shifts<>(SB),RODATA,$256 1307 1308 // memequal(p, q unsafe.Pointer, size uintptr) bool 1309 TEXT runtime·memequal(SB),NOSPLIT,$0-25 1310 MOVQ a+0(FP), SI 1311 MOVQ b+8(FP), DI 1312 CMPQ SI, DI 1313 JEQ eq 1314 MOVQ size+16(FP), BX 1315 LEAQ ret+24(FP), AX 1316 JMP runtime·memeqbody(SB) 1317 eq: 1318 MOVB $1, ret+24(FP) 1319 RET 1320 1321 // memequal_varlen(a, b unsafe.Pointer) bool 1322 TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17 1323 MOVQ a+0(FP), SI 1324 MOVQ b+8(FP), DI 1325 CMPQ SI, DI 1326 JEQ eq 1327 MOVQ 8(DX), BX // compiler stores size at offset 8 in the closure 1328 LEAQ ret+16(FP), AX 1329 JMP runtime·memeqbody(SB) 1330 eq: 1331 MOVB $1, ret+16(FP) 1332 RET 1333 1334 // eqstring tests whether two strings are equal. 1335 // The compiler guarantees that strings passed 1336 // to eqstring have equal length. 1337 // See runtime_test.go:eqstring_generic for 1338 // equivalent Go code. 1339 TEXT runtime·eqstring(SB),NOSPLIT,$0-33 1340 MOVQ s1_base+0(FP), SI 1341 MOVQ s2_base+16(FP), DI 1342 CMPQ SI, DI 1343 JEQ eq 1344 MOVQ s1_len+8(FP), BX 1345 LEAQ ret+32(FP), AX 1346 JMP runtime·memeqbody(SB) 1347 eq: 1348 MOVB $1, ret+32(FP) 1349 RET 1350 1351 // a in SI 1352 // b in DI 1353 // count in BX 1354 // address of result byte in AX 1355 TEXT runtime·memeqbody(SB),NOSPLIT,$0-0 1356 CMPQ BX, $8 1357 JB small 1358 CMPQ BX, $64 1359 JB bigloop 1360 CMPB runtime·support_avx2(SB), $1 1361 JE hugeloop_avx2 1362 1363 // 64 bytes at a time using xmm registers 1364 hugeloop: 1365 CMPQ BX, $64 1366 JB bigloop 1367 MOVOU (SI), X0 1368 MOVOU (DI), X1 1369 MOVOU 16(SI), X2 1370 MOVOU 16(DI), X3 1371 MOVOU 32(SI), X4 1372 MOVOU 32(DI), X5 1373 MOVOU 48(SI), X6 1374 MOVOU 48(DI), X7 1375 PCMPEQB X1, X0 1376 PCMPEQB X3, X2 1377 PCMPEQB X5, X4 1378 PCMPEQB X7, X6 1379 PAND X2, X0 1380 PAND X6, X4 1381 PAND X4, X0 1382 PMOVMSKB X0, DX 1383 ADDQ $64, SI 1384 ADDQ $64, DI 1385 SUBQ $64, BX 1386 CMPL DX, $0xffff 1387 JEQ hugeloop 1388 MOVB $0, (AX) 
1389 RET 1390 1391 // 64 bytes at a time using ymm registers 1392 hugeloop_avx2: 1393 CMPQ BX, $64 1394 JB bigloop_avx2 1395 VMOVDQU (SI), Y0 1396 VMOVDQU (DI), Y1 1397 VMOVDQU 32(SI), Y2 1398 VMOVDQU 32(DI), Y3 1399 VPCMPEQB Y1, Y0, Y4 1400 VPCMPEQB Y2, Y3, Y5 1401 VPAND Y4, Y5, Y6 1402 VPMOVMSKB Y6, DX 1403 ADDQ $64, SI 1404 ADDQ $64, DI 1405 SUBQ $64, BX 1406 CMPL DX, $0xffffffff 1407 JEQ hugeloop_avx2 1408 VZEROUPPER 1409 MOVB $0, (AX) 1410 RET 1411 1412 bigloop_avx2: 1413 VZEROUPPER 1414 1415 // 8 bytes at a time using 64-bit register 1416 bigloop: 1417 CMPQ BX, $8 1418 JBE leftover 1419 MOVQ (SI), CX 1420 MOVQ (DI), DX 1421 ADDQ $8, SI 1422 ADDQ $8, DI 1423 SUBQ $8, BX 1424 CMPQ CX, DX 1425 JEQ bigloop 1426 MOVB $0, (AX) 1427 RET 1428 1429 // remaining 0-8 bytes 1430 leftover: 1431 MOVQ -8(SI)(BX*1), CX 1432 MOVQ -8(DI)(BX*1), DX 1433 CMPQ CX, DX 1434 SETEQ (AX) 1435 RET 1436 1437 small: 1438 CMPQ BX, $0 1439 JEQ equal 1440 1441 LEAQ 0(BX*8), CX 1442 NEGQ CX 1443 1444 CMPB SI, $0xf8 1445 JA si_high 1446 1447 // load at SI won't cross a page boundary. 1448 MOVQ (SI), SI 1449 JMP si_finish 1450 si_high: 1451 // address ends in 11111xxx. Load up to bytes we want, move to correct position. 1452 MOVQ -8(SI)(BX*1), SI 1453 SHRQ CX, SI 1454 si_finish: 1455 1456 // same for DI. 
1457 CMPB DI, $0xf8 1458 JA di_high 1459 MOVQ (DI), DI 1460 JMP di_finish 1461 di_high: 1462 MOVQ -8(DI)(BX*1), DI 1463 SHRQ CX, DI 1464 di_finish: 1465 1466 SUBQ SI, DI 1467 SHLQ CX, DI 1468 equal: 1469 SETEQ (AX) 1470 RET 1471 1472 TEXT runtime·cmpstring(SB),NOSPLIT,$0-40 1473 MOVQ s1_base+0(FP), SI 1474 MOVQ s1_len+8(FP), BX 1475 MOVQ s2_base+16(FP), DI 1476 MOVQ s2_len+24(FP), DX 1477 LEAQ ret+32(FP), R9 1478 JMP runtime·cmpbody(SB) 1479 1480 TEXT bytes·Compare(SB),NOSPLIT,$0-56 1481 MOVQ s1+0(FP), SI 1482 MOVQ s1+8(FP), BX 1483 MOVQ s2+24(FP), DI 1484 MOVQ s2+32(FP), DX 1485 LEAQ res+48(FP), R9 1486 JMP runtime·cmpbody(SB) 1487 1488 // input: 1489 // SI = a 1490 // DI = b 1491 // BX = alen 1492 // DX = blen 1493 // R9 = address of output word (stores -1/0/1 here) 1494 TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 1495 CMPQ SI, DI 1496 JEQ allsame 1497 CMPQ BX, DX 1498 MOVQ DX, R8 1499 CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare 1500 CMPQ R8, $8 1501 JB small 1502 1503 CMPQ R8, $63 1504 JBE loop 1505 CMPB runtime·support_avx2(SB), $1 1506 JEQ big_loop_avx2 1507 JMP big_loop 1508 loop: 1509 CMPQ R8, $16 1510 JBE _0through16 1511 MOVOU (SI), X0 1512 MOVOU (DI), X1 1513 PCMPEQB X0, X1 1514 PMOVMSKB X1, AX 1515 XORQ $0xffff, AX // convert EQ to NE 1516 JNE diff16 // branch if at least one byte is not equal 1517 ADDQ $16, SI 1518 ADDQ $16, DI 1519 SUBQ $16, R8 1520 JMP loop 1521 1522 diff64: 1523 ADDQ $48, SI 1524 ADDQ $48, DI 1525 JMP diff16 1526 diff48: 1527 ADDQ $32, SI 1528 ADDQ $32, DI 1529 JMP diff16 1530 diff32: 1531 ADDQ $16, SI 1532 ADDQ $16, DI 1533 // AX = bit mask of differences 1534 diff16: 1535 BSFQ AX, BX // index of first byte that differs 1536 XORQ AX, AX 1537 MOVB (SI)(BX*1), CX 1538 CMPB CX, (DI)(BX*1) 1539 SETHI AX 1540 LEAQ -1(AX*2), AX // convert 1/0 to +1/-1 1541 MOVQ AX, (R9) 1542 RET 1543 1544 // 0 through 16 bytes left, alen>=8, blen>=8 1545 _0through16: 1546 CMPQ R8, $8 1547 JBE _0through8 1548 MOVQ (SI), AX 1549 MOVQ (DI), CX 
1550 CMPQ AX, CX 1551 JNE diff8 1552 _0through8: 1553 MOVQ -8(SI)(R8*1), AX 1554 MOVQ -8(DI)(R8*1), CX 1555 CMPQ AX, CX 1556 JEQ allsame 1557 1558 // AX and CX contain parts of a and b that differ. 1559 diff8: 1560 BSWAPQ AX // reverse order of bytes 1561 BSWAPQ CX 1562 XORQ AX, CX 1563 BSRQ CX, CX // index of highest bit difference 1564 SHRQ CX, AX // move a's bit to bottom 1565 ANDQ $1, AX // mask bit 1566 LEAQ -1(AX*2), AX // 1/0 => +1/-1 1567 MOVQ AX, (R9) 1568 RET 1569 1570 // 0-7 bytes in common 1571 small: 1572 LEAQ (R8*8), CX // bytes left -> bits left 1573 NEGQ CX // - bits lift (== 64 - bits left mod 64) 1574 JEQ allsame 1575 1576 // load bytes of a into high bytes of AX 1577 CMPB SI, $0xf8 1578 JA si_high 1579 MOVQ (SI), SI 1580 JMP si_finish 1581 si_high: 1582 MOVQ -8(SI)(R8*1), SI 1583 SHRQ CX, SI 1584 si_finish: 1585 SHLQ CX, SI 1586 1587 // load bytes of b in to high bytes of BX 1588 CMPB DI, $0xf8 1589 JA di_high 1590 MOVQ (DI), DI 1591 JMP di_finish 1592 di_high: 1593 MOVQ -8(DI)(R8*1), DI 1594 SHRQ CX, DI 1595 di_finish: 1596 SHLQ CX, DI 1597 1598 BSWAPQ SI // reverse order of bytes 1599 BSWAPQ DI 1600 XORQ SI, DI // find bit differences 1601 JEQ allsame 1602 BSRQ DI, CX // index of highest bit difference 1603 SHRQ CX, SI // move a's bit to bottom 1604 ANDQ $1, SI // mask bit 1605 LEAQ -1(SI*2), AX // 1/0 => +1/-1 1606 MOVQ AX, (R9) 1607 RET 1608 1609 allsame: 1610 XORQ AX, AX 1611 XORQ CX, CX 1612 CMPQ BX, DX 1613 SETGT AX // 1 if alen > blen 1614 SETEQ CX // 1 if alen == blen 1615 LEAQ -1(CX)(AX*2), AX // 1,0,-1 result 1616 MOVQ AX, (R9) 1617 RET 1618 1619 // this works for >= 64 bytes of data. 
1620 big_loop: 1621 MOVOU (SI), X0 1622 MOVOU (DI), X1 1623 PCMPEQB X0, X1 1624 PMOVMSKB X1, AX 1625 XORQ $0xffff, AX 1626 JNE diff16 1627 1628 MOVOU 16(SI), X0 1629 MOVOU 16(DI), X1 1630 PCMPEQB X0, X1 1631 PMOVMSKB X1, AX 1632 XORQ $0xffff, AX 1633 JNE diff32 1634 1635 MOVOU 32(SI), X0 1636 MOVOU 32(DI), X1 1637 PCMPEQB X0, X1 1638 PMOVMSKB X1, AX 1639 XORQ $0xffff, AX 1640 JNE diff48 1641 1642 MOVOU 48(SI), X0 1643 MOVOU 48(DI), X1 1644 PCMPEQB X0, X1 1645 PMOVMSKB X1, AX 1646 XORQ $0xffff, AX 1647 JNE diff64 1648 1649 ADDQ $64, SI 1650 ADDQ $64, DI 1651 SUBQ $64, R8 1652 CMPQ R8, $64 1653 JBE loop 1654 JMP big_loop 1655 1656 // Compare 64-bytes per loop iteration. 1657 // Loop is unrolled and uses AVX2. 1658 big_loop_avx2: 1659 VMOVDQU (SI), Y2 1660 VMOVDQU (DI), Y3 1661 VMOVDQU 32(SI), Y4 1662 VMOVDQU 32(DI), Y5 1663 VPCMPEQB Y2, Y3, Y0 1664 VPMOVMSKB Y0, AX 1665 XORL $0xffffffff, AX 1666 JNE diff32_avx2 1667 VPCMPEQB Y4, Y5, Y6 1668 VPMOVMSKB Y6, AX 1669 XORL $0xffffffff, AX 1670 JNE diff64_avx2 1671 1672 ADDQ $64, SI 1673 ADDQ $64, DI 1674 SUBQ $64, R8 1675 CMPQ R8, $64 1676 JB big_loop_avx2_exit 1677 JMP big_loop_avx2 1678 1679 // Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk. 1680 diff32_avx2: 1681 VZEROUPPER 1682 JMP diff16 1683 1684 // Same as diff32_avx2, but for last 32 bytes. 1685 diff64_avx2: 1686 VZEROUPPER 1687 JMP diff48 1688 1689 // For <64 bytes remainder jump to normal loop. 
1690 big_loop_avx2_exit: 1691 VZEROUPPER 1692 JMP loop 1693 1694 1695 TEXT strings·supportAVX2(SB),NOSPLIT,$0-1 1696 MOVBLZX runtime·support_avx2(SB), AX 1697 MOVB AX, ret+0(FP) 1698 RET 1699 1700 TEXT bytes·supportAVX2(SB),NOSPLIT,$0-1 1701 MOVBLZX runtime·support_avx2(SB), AX 1702 MOVB AX, ret+0(FP) 1703 RET 1704 1705 TEXT strings·supportPOPCNT(SB),NOSPLIT,$0-1 1706 MOVBLZX runtime·support_popcnt(SB), AX 1707 MOVB AX, ret+0(FP) 1708 RET 1709 1710 TEXT bytes·supportPOPCNT(SB),NOSPLIT,$0-1 1711 MOVBLZX runtime·support_popcnt(SB), AX 1712 MOVB AX, ret+0(FP) 1713 RET 1714 1715 TEXT strings·indexShortStr(SB),NOSPLIT,$0-40 1716 MOVQ s+0(FP), DI 1717 // We want len in DX and AX, because PCMPESTRI implicitly consumes them 1718 MOVQ s_len+8(FP), DX 1719 MOVQ c+16(FP), BP 1720 MOVQ c_len+24(FP), AX 1721 MOVQ DI, R10 1722 LEAQ ret+32(FP), R11 1723 JMP runtime·indexShortStr(SB) 1724 1725 TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56 1726 MOVQ s+0(FP), DI 1727 MOVQ s_len+8(FP), DX 1728 MOVQ c+24(FP), BP 1729 MOVQ c_len+32(FP), AX 1730 MOVQ DI, R10 1731 LEAQ ret+48(FP), R11 1732 JMP runtime·indexShortStr(SB) 1733 1734 // AX: length of string, that we are searching for 1735 // DX: length of string, in which we are searching 1736 // DI: pointer to string, in which we are searching 1737 // BP: pointer to string, that we are searching for 1738 // R11: address, where to put return value 1739 TEXT runtime·indexShortStr(SB),NOSPLIT,$0 1740 CMPQ AX, DX 1741 JA fail 1742 CMPQ DX, $16 1743 JAE sse42 1744 no_sse42: 1745 CMPQ AX, $2 1746 JA _3_or_more 1747 MOVW (BP), BP 1748 LEAQ -1(DI)(DX*1), DX 1749 loop2: 1750 MOVW (DI), SI 1751 CMPW SI,BP 1752 JZ success 1753 ADDQ $1,DI 1754 CMPQ DI,DX 1755 JB loop2 1756 JMP fail 1757 _3_or_more: 1758 CMPQ AX, $3 1759 JA _4_or_more 1760 MOVW 1(BP), BX 1761 MOVW (BP), BP 1762 LEAQ -2(DI)(DX*1), DX 1763 loop3: 1764 MOVW (DI), SI 1765 CMPW SI,BP 1766 JZ partial_success3 1767 ADDQ $1,DI 1768 CMPQ DI,DX 1769 JB loop3 1770 JMP fail 1771 partial_success3: 1772 
MOVW 1(DI), SI 1773 CMPW SI,BX 1774 JZ success 1775 ADDQ $1,DI 1776 CMPQ DI,DX 1777 JB loop3 1778 JMP fail 1779 _4_or_more: 1780 CMPQ AX, $4 1781 JA _5_or_more 1782 MOVL (BP), BP 1783 LEAQ -3(DI)(DX*1), DX 1784 loop4: 1785 MOVL (DI), SI 1786 CMPL SI,BP 1787 JZ success 1788 ADDQ $1,DI 1789 CMPQ DI,DX 1790 JB loop4 1791 JMP fail 1792 _5_or_more: 1793 CMPQ AX, $7 1794 JA _8_or_more 1795 LEAQ 1(DI)(DX*1), DX 1796 SUBQ AX, DX 1797 MOVL -4(BP)(AX*1), BX 1798 MOVL (BP), BP 1799 loop5to7: 1800 MOVL (DI), SI 1801 CMPL SI,BP 1802 JZ partial_success5to7 1803 ADDQ $1,DI 1804 CMPQ DI,DX 1805 JB loop5to7 1806 JMP fail 1807 partial_success5to7: 1808 MOVL -4(AX)(DI*1), SI 1809 CMPL SI,BX 1810 JZ success 1811 ADDQ $1,DI 1812 CMPQ DI,DX 1813 JB loop5to7 1814 JMP fail 1815 _8_or_more: 1816 CMPQ AX, $8 1817 JA _9_or_more 1818 MOVQ (BP), BP 1819 LEAQ -7(DI)(DX*1), DX 1820 loop8: 1821 MOVQ (DI), SI 1822 CMPQ SI,BP 1823 JZ success 1824 ADDQ $1,DI 1825 CMPQ DI,DX 1826 JB loop8 1827 JMP fail 1828 _9_or_more: 1829 CMPQ AX, $15 1830 JA _16_or_more 1831 LEAQ 1(DI)(DX*1), DX 1832 SUBQ AX, DX 1833 MOVQ -8(BP)(AX*1), BX 1834 MOVQ (BP), BP 1835 loop9to15: 1836 MOVQ (DI), SI 1837 CMPQ SI,BP 1838 JZ partial_success9to15 1839 ADDQ $1,DI 1840 CMPQ DI,DX 1841 JB loop9to15 1842 JMP fail 1843 partial_success9to15: 1844 MOVQ -8(AX)(DI*1), SI 1845 CMPQ SI,BX 1846 JZ success 1847 ADDQ $1,DI 1848 CMPQ DI,DX 1849 JB loop9to15 1850 JMP fail 1851 _16_or_more: 1852 CMPQ AX, $16 1853 JA _17_or_more 1854 MOVOU (BP), X1 1855 LEAQ -15(DI)(DX*1), DX 1856 loop16: 1857 MOVOU (DI), X2 1858 PCMPEQB X1, X2 1859 PMOVMSKB X2, SI 1860 CMPQ SI, $0xffff 1861 JE success 1862 ADDQ $1,DI 1863 CMPQ DI,DX 1864 JB loop16 1865 JMP fail 1866 _17_or_more: 1867 CMPQ AX, $31 1868 JA _32_or_more 1869 LEAQ 1(DI)(DX*1), DX 1870 SUBQ AX, DX 1871 MOVOU -16(BP)(AX*1), X0 1872 MOVOU (BP), X1 1873 loop17to31: 1874 MOVOU (DI), X2 1875 PCMPEQB X1,X2 1876 PMOVMSKB X2, SI 1877 CMPQ SI, $0xffff 1878 JE partial_success17to31 1879 ADDQ $1,DI 1880 CMPQ 
DI,DX 1881 JB loop17to31 1882 JMP fail 1883 partial_success17to31: 1884 MOVOU -16(AX)(DI*1), X3 1885 PCMPEQB X0, X3 1886 PMOVMSKB X3, SI 1887 CMPQ SI, $0xffff 1888 JE success 1889 ADDQ $1,DI 1890 CMPQ DI,DX 1891 JB loop17to31 1892 JMP fail 1893 // We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63 1894 // So no need to check cpuid 1895 _32_or_more: 1896 CMPQ AX, $32 1897 JA _33_to_63 1898 VMOVDQU (BP), Y1 1899 LEAQ -31(DI)(DX*1), DX 1900 loop32: 1901 VMOVDQU (DI), Y2 1902 VPCMPEQB Y1, Y2, Y3 1903 VPMOVMSKB Y3, SI 1904 CMPL SI, $0xffffffff 1905 JE success_avx2 1906 ADDQ $1,DI 1907 CMPQ DI,DX 1908 JB loop32 1909 JMP fail_avx2 1910 _33_to_63: 1911 LEAQ 1(DI)(DX*1), DX 1912 SUBQ AX, DX 1913 VMOVDQU -32(BP)(AX*1), Y0 1914 VMOVDQU (BP), Y1 1915 loop33to63: 1916 VMOVDQU (DI), Y2 1917 VPCMPEQB Y1, Y2, Y3 1918 VPMOVMSKB Y3, SI 1919 CMPL SI, $0xffffffff 1920 JE partial_success33to63 1921 ADDQ $1,DI 1922 CMPQ DI,DX 1923 JB loop33to63 1924 JMP fail_avx2 1925 partial_success33to63: 1926 VMOVDQU -32(AX)(DI*1), Y3 1927 VPCMPEQB Y0, Y3, Y4 1928 VPMOVMSKB Y4, SI 1929 CMPL SI, $0xffffffff 1930 JE success_avx2 1931 ADDQ $1,DI 1932 CMPQ DI,DX 1933 JB loop33to63 1934 fail_avx2: 1935 VZEROUPPER 1936 fail: 1937 MOVQ $-1, (R11) 1938 RET 1939 success_avx2: 1940 VZEROUPPER 1941 JMP success 1942 sse42: 1943 MOVL runtime·cpuid_ecx(SB), CX 1944 ANDL $0x100000, CX 1945 JZ no_sse42 1946 CMPQ AX, $12 1947 // PCMPESTRI is slower than normal compare, 1948 // so using it makes sense only if we advance 4+ bytes per compare 1949 // This value was determined experimentally and is the ~same 1950 // on Nehalem (first with SSE42) and Haswell. 
1951 JAE _9_or_more 1952 LEAQ 16(BP), SI 1953 TESTW $0xff0, SI 1954 JEQ no_sse42 1955 MOVOU (BP), X1 1956 LEAQ -15(DI)(DX*1), SI 1957 MOVQ $16, R9 1958 SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9 1959 loop_sse42: 1960 // 0x0c means: unsigned byte compare (bits 0,1 are 00) 1961 // for equality (bits 2,3 are 11) 1962 // result is not masked or inverted (bits 4,5 are 00) 1963 // and corresponds to first matching byte (bit 6 is 0) 1964 PCMPESTRI $0x0c, (DI), X1 1965 // CX == 16 means no match, 1966 // CX > R9 means partial match at the end of the string, 1967 // otherwise sep is at offset CX from X1 start 1968 CMPQ CX, R9 1969 JBE sse42_success 1970 ADDQ R9, DI 1971 CMPQ DI, SI 1972 JB loop_sse42 1973 PCMPESTRI $0x0c, -1(SI), X1 1974 CMPQ CX, R9 1975 JA fail 1976 LEAQ -1(SI), DI 1977 sse42_success: 1978 ADDQ CX, DI 1979 success: 1980 SUBQ R10, DI 1981 MOVQ DI, (R11) 1982 RET 1983 1984 1985 TEXT bytes·IndexByte(SB),NOSPLIT,$0-40 1986 MOVQ s+0(FP), SI 1987 MOVQ s_len+8(FP), BX 1988 MOVB c+24(FP), AL 1989 LEAQ ret+32(FP), R8 1990 JMP runtime·indexbytebody(SB) 1991 1992 TEXT strings·IndexByte(SB),NOSPLIT,$0-32 1993 MOVQ s+0(FP), SI 1994 MOVQ s_len+8(FP), BX 1995 MOVB c+16(FP), AL 1996 LEAQ ret+24(FP), R8 1997 JMP runtime·indexbytebody(SB) 1998 1999 // input: 2000 // SI: data 2001 // BX: data len 2002 // AL: byte sought 2003 // R8: address to put result 2004 TEXT runtime·indexbytebody(SB),NOSPLIT,$0 2005 // Shuffle X0 around so that each byte contains 2006 // the character we're looking for. 2007 MOVD AX, X0 2008 PUNPCKLBW X0, X0 2009 PUNPCKLBW X0, X0 2010 PSHUFL $0, X0, X0 2011 2012 CMPQ BX, $16 2013 JLT small 2014 2015 MOVQ SI, DI 2016 2017 CMPQ BX, $32 2018 JA avx2 2019 sse: 2020 LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes 2021 JMP sseloopentry 2022 2023 sseloop: 2024 // Move the next 16-byte chunk of the data into X1. 2025 MOVOU (DI), X1 2026 // Compare bytes in X0 to X1. 
2027 PCMPEQB X0, X1 2028 // Take the top bit of each byte in X1 and put the result in DX. 2029 PMOVMSKB X1, DX 2030 // Find first set bit, if any. 2031 BSFL DX, DX 2032 JNZ ssesuccess 2033 // Advance to next block. 2034 ADDQ $16, DI 2035 sseloopentry: 2036 CMPQ DI, AX 2037 JB sseloop 2038 2039 // Search the last 16-byte chunk. This chunk may overlap with the 2040 // chunks we've already searched, but that's ok. 2041 MOVQ AX, DI 2042 MOVOU (AX), X1 2043 PCMPEQB X0, X1 2044 PMOVMSKB X1, DX 2045 BSFL DX, DX 2046 JNZ ssesuccess 2047 2048 failure: 2049 MOVQ $-1, (R8) 2050 RET 2051 2052 // We've found a chunk containing the byte. 2053 // The chunk was loaded from DI. 2054 // The index of the matching byte in the chunk is DX. 2055 // The start of the data is SI. 2056 ssesuccess: 2057 SUBQ SI, DI // Compute offset of chunk within data. 2058 ADDQ DX, DI // Add offset of byte within chunk. 2059 MOVQ DI, (R8) 2060 RET 2061 2062 // handle for lengths < 16 2063 small: 2064 TESTQ BX, BX 2065 JEQ failure 2066 2067 // Check if we'll load across a page boundary. 2068 LEAQ 16(SI), AX 2069 TESTW $0xff0, AX 2070 JEQ endofpage 2071 2072 MOVOU (SI), X1 // Load data 2073 PCMPEQB X0, X1 // Compare target byte with each byte in data. 2074 PMOVMSKB X1, DX // Move result bits to integer register. 2075 BSFL DX, DX // Find first set bit. 2076 JZ failure // No set bit, failure. 2077 CMPL DX, BX 2078 JAE failure // Match is past end of data. 2079 MOVQ DX, (R8) 2080 RET 2081 2082 endofpage: 2083 MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1. 2084 PCMPEQB X0, X1 // Compare target byte with each byte in data. 2085 PMOVMSKB X1, DX // Move result bits to integer register. 2086 MOVL BX, CX 2087 SHLL CX, DX 2088 SHRL $16, DX // Shift desired bits down to bottom of register. 2089 BSFL DX, DX // Find first set bit. 2090 JZ failure // No set bit, failure. 
2091 MOVQ DX, (R8) 2092 RET 2093 2094 avx2: 2095 CMPB runtime·support_avx2(SB), $1 2096 JNE sse 2097 MOVD AX, X0 2098 LEAQ -32(SI)(BX*1), R11 2099 VPBROADCASTB X0, Y1 2100 avx2_loop: 2101 VMOVDQU (DI), Y2 2102 VPCMPEQB Y1, Y2, Y3 2103 VPTEST Y3, Y3 2104 JNZ avx2success 2105 ADDQ $32, DI 2106 CMPQ DI, R11 2107 JLT avx2_loop 2108 MOVQ R11, DI 2109 VMOVDQU (DI), Y2 2110 VPCMPEQB Y1, Y2, Y3 2111 VPTEST Y3, Y3 2112 JNZ avx2success 2113 VZEROUPPER 2114 MOVQ $-1, (R8) 2115 RET 2116 2117 avx2success: 2118 VPMOVMSKB Y3, DX 2119 BSFL DX, DX 2120 SUBQ SI, DI 2121 ADDQ DI, DX 2122 MOVQ DX, (R8) 2123 VZEROUPPER 2124 RET 2125 2126 TEXT bytes·Equal(SB),NOSPLIT,$0-49 2127 MOVQ a_len+8(FP), BX 2128 MOVQ b_len+32(FP), CX 2129 CMPQ BX, CX 2130 JNE eqret 2131 MOVQ a+0(FP), SI 2132 MOVQ b+24(FP), DI 2133 LEAQ ret+48(FP), AX 2134 JMP runtime·memeqbody(SB) 2135 eqret: 2136 MOVB $0, ret+48(FP) 2137 RET 2138 2139 2140 TEXT bytes·countByte(SB),NOSPLIT,$0-40 2141 MOVQ s+0(FP), SI 2142 MOVQ s_len+8(FP), BX 2143 MOVB c+24(FP), AL 2144 LEAQ ret+32(FP), R8 2145 JMP runtime·countByte(SB) 2146 2147 TEXT strings·countByte(SB),NOSPLIT,$0-32 2148 MOVQ s+0(FP), SI 2149 MOVQ s_len+8(FP), BX 2150 MOVB c+16(FP), AL 2151 LEAQ ret+24(FP), R8 2152 JMP runtime·countByte(SB) 2153 2154 // input: 2155 // SI: data 2156 // BX: data len 2157 // AL: byte sought 2158 // R8: address to put result 2159 // This requires the POPCNT instruction 2160 TEXT runtime·countByte(SB),NOSPLIT,$0 2161 // Shuffle X0 around so that each byte contains 2162 // the character we're looking for. 2163 MOVD AX, X0 2164 PUNPCKLBW X0, X0 2165 PUNPCKLBW X0, X0 2166 PSHUFL $0, X0, X0 2167 2168 CMPQ BX, $16 2169 JLT small 2170 2171 MOVQ $0, R12 // Accumulator 2172 2173 MOVQ SI, DI 2174 2175 CMPQ BX, $32 2176 JA avx2 2177 sse: 2178 LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes 2179 JMP sseloopentry 2180 2181 sseloop: 2182 // Move the next 16-byte chunk of the data into X1. 2183 MOVOU (DI), X1 2184 // Compare bytes in X0 to X1. 
2185 PCMPEQB X0, X1 2186 // Take the top bit of each byte in X1 and put the result in DX. 2187 PMOVMSKB X1, DX 2188 // Count number of matching bytes 2189 POPCNTL DX, DX 2190 // Accumulate into R12 2191 ADDQ DX, R12 2192 // Advance to next block. 2193 ADDQ $16, DI 2194 sseloopentry: 2195 CMPQ DI, AX 2196 JBE sseloop 2197 2198 // Get the number of bytes to consider in the last 16 bytes 2199 ANDQ $15, BX 2200 JZ end 2201 2202 // Create mask to ignore overlap between previous 16 byte block 2203 // and the next. 2204 MOVQ $16,CX 2205 SUBQ BX, CX 2206 MOVQ $0xFFFF, R10 2207 SARQ CL, R10 2208 SALQ CL, R10 2209 2210 // Process the last 16-byte chunk. This chunk may overlap with the 2211 // chunks we've already searched so we need to mask part of it. 2212 MOVOU (AX), X1 2213 PCMPEQB X0, X1 2214 PMOVMSKB X1, DX 2215 // Apply mask 2216 ANDQ R10, DX 2217 POPCNTL DX, DX 2218 ADDQ DX, R12 2219 end: 2220 MOVQ R12, (R8) 2221 RET 2222 2223 // handle for lengths < 16 2224 small: 2225 TESTQ BX, BX 2226 JEQ endzero 2227 2228 // Check if we'll load across a page boundary. 2229 LEAQ 16(SI), AX 2230 TESTW $0xff0, AX 2231 JEQ endofpage 2232 2233 // We must ignore high bytes as they aren't part of our slice. 2234 // Create mask. 2235 MOVB BX, CX 2236 MOVQ $1, R10 2237 SALQ CL, R10 2238 SUBQ $1, R10 2239 2240 // Load data 2241 MOVOU (SI), X1 2242 // Compare target byte with each byte in data. 2243 PCMPEQB X0, X1 2244 // Move result bits to integer register. 2245 PMOVMSKB X1, DX 2246 // Apply mask 2247 ANDQ R10, DX 2248 POPCNTL DX, DX 2249 // Directly return DX, we don't need to accumulate 2250 // since we have <16 bytes. 2251 MOVQ DX, (R8) 2252 RET 2253 endzero: 2254 MOVQ $0, (R8) 2255 RET 2256 2257 endofpage: 2258 // We must ignore low bytes as they aren't part of our slice. 2259 MOVQ $16,CX 2260 SUBQ BX, CX 2261 MOVQ $0xFFFF, R10 2262 SARQ CL, R10 2263 SALQ CL, R10 2264 2265 // Load data into the high end of X1. 
2266 MOVOU -16(SI)(BX*1), X1 2267 // Compare target byte with each byte in data. 2268 PCMPEQB X0, X1 2269 // Move result bits to integer register. 2270 PMOVMSKB X1, DX 2271 // Apply mask 2272 ANDQ R10, DX 2273 // Directly return DX, we don't need to accumulate 2274 // since we have <16 bytes. 2275 POPCNTL DX, DX 2276 MOVQ DX, (R8) 2277 RET 2278 2279 avx2: 2280 CMPB runtime·support_avx2(SB), $1 2281 JNE sse 2282 MOVD AX, X0 2283 LEAQ -32(SI)(BX*1), R11 2284 VPBROADCASTB X0, Y1 2285 avx2_loop: 2286 VMOVDQU (DI), Y2 2287 VPCMPEQB Y1, Y2, Y3 2288 VPMOVMSKB Y3, DX 2289 POPCNTL DX, DX 2290 ADDQ DX, R12 2291 ADDQ $32, DI 2292 CMPQ DI, R11 2293 JLE avx2_loop 2294 2295 // If last block is already processed, 2296 // skip to the end. 2297 CMPQ DI, R11 2298 JEQ endavx 2299 2300 // Load address of the last 32 bytes. 2301 // There is an overlap with the previous block. 2302 MOVQ R11, DI 2303 VMOVDQU (DI), Y2 2304 VPCMPEQB Y1, Y2, Y3 2305 VPMOVMSKB Y3, DX 2306 // Exit AVX mode. 2307 VZEROUPPER 2308 2309 // Create mask to ignore overlap between previous 32 byte block 2310 // and the next. 2311 ANDQ $31, BX 2312 MOVQ $32,CX 2313 SUBQ BX, CX 2314 MOVQ $0xFFFFFFFF, R10 2315 SARQ CL, R10 2316 SALQ CL, R10 2317 // Apply mask 2318 ANDQ R10, DX 2319 POPCNTL DX, DX 2320 ADDQ DX, R12 2321 MOVQ R12, (R8) 2322 RET 2323 endavx: 2324 // Exit AVX mode. 2325 VZEROUPPER 2326 MOVQ R12, (R8) 2327 RET 2328 2329 TEXT runtime·return0(SB), NOSPLIT, $0 2330 MOVL $0, AX 2331 RET 2332 2333 2334 // Called from cgo wrappers, this function returns g->m->curg.stack.hi. 2335 // Must obey the gcc calling convention. 2336 TEXT _cgo_topofstack(SB),NOSPLIT,$0 2337 get_tls(CX) 2338 MOVQ g(CX), AX 2339 MOVQ g_m(AX), AX 2340 MOVQ m_curg(AX), AX 2341 MOVQ (g_stack+stack_hi)(AX), AX 2342 RET 2343 2344 // The top-most function running on a goroutine 2345 // returns to goexit+PCQuantum. 
// goexit is the return address placed at the top of every goroutine stack;
// the call into goexit1 never returns. The NOPs pad the symbol so that a
// traceback PC of goexit+PCQuantum still falls inside goexit's code range.
TEXT runtime·goexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP
	CALL	runtime·goexit1(SB)	// does not return
	// traceback from goexit1 must hit code range of goexit
	BYTE	$0x90	// NOP

// prefetcht0(addr uintptr): prefetch into all cache levels.
TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT0	(AX)
	RET

// prefetcht1(addr uintptr): prefetch into L2 and below.
TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT1	(AX)
	RET

// prefetcht2(addr uintptr): prefetch into L3 and below.
TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT2	(AX)
	RET

// prefetchnta(addr uintptr): non-temporal prefetch (minimize cache pollution).
TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHNTA	(AX)
	RET

// This is called from .init_array and follows the platform, not Go, ABI.
// Appends the moduledata passed in DI (first C argument, SysV ABI) to the
// runtime's linked list of modules.
TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
	PUSHQ	R15	// The access to global variables below implicitly uses R15, which is callee-save
	MOVQ	runtime·lastmoduledatap(SB), AX
	MOVQ	DI, moduledata_next(AX)
	MOVQ	DI, runtime·lastmoduledatap(SB)
	POPQ	R15
	RET