// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file is written in the Go (Plan 9 style) assembler dialect for amd64.
// SP/FP below are the assembler's pseudo-registers unless hardware SP is
// clearly implied by context; symbol·middle-dot names are Go symbols.

#include "go_asm.h"
#include "go_tls.h"
#include "funcdata.h"
#include "textflag.h"

// _rt0_amd64 is common startup code for most amd64 systems when using
// internal linking. This is the entry point for the program from the
// kernel for an ordinary -buildmode=exe program. The stack holds the
// number of arguments and the C-style argv.
TEXT _rt0_amd64(SB),NOSPLIT,$-8
	MOVQ	0(SP), DI	// argc
	LEAQ	8(SP), SI	// argv
	JMP	runtime·rt0_go(SB)

// main is common startup code for most amd64 systems when using
// external linking. The C startup code will call the symbol "main"
// passing argc and argv in the usual C ABI registers DI and SI.
TEXT main(SB),NOSPLIT,$-8
	JMP	runtime·rt0_go(SB)

// _rt0_amd64_lib is common startup code for most amd64 systems when
// using -buildmode=c-archive or -buildmode=c-shared. The linker will
// arrange to invoke this function as a global constructor (for
// c-archive) or when the shared library is loaded (for c-shared).
// We expect argc and argv to be passed in the usual C ABI registers
// DI and SI.
TEXT _rt0_amd64_lib(SB),NOSPLIT,$0x50
	// Align stack per ELF ABI requirements.
	MOVQ	SP, AX
	ANDQ	$~15, SP
	// Save C ABI callee-saved registers, as caller may need them.
	MOVQ	BX, 0x10(SP)
	MOVQ	BP, 0x18(SP)
	MOVQ	R12, 0x20(SP)
	MOVQ	R13, 0x28(SP)
	MOVQ	R14, 0x30(SP)
	MOVQ	R15, 0x38(SP)
	MOVQ	AX, 0x40(SP)	// original (pre-alignment) SP, restored below

	MOVQ	DI, _rt0_amd64_lib_argc<>(SB)
	MOVQ	SI, _rt0_amd64_lib_argv<>(SB)

	// Synchronous initialization.
	CALL	runtime·libpreinit(SB)

	// Create a new thread to finish Go runtime initialization.
	MOVQ	_cgo_sys_thread_create(SB), AX
	TESTQ	AX, AX
	JZ	nocgo
	MOVQ	$_rt0_amd64_lib_go(SB), DI
	MOVQ	$0, SI
	CALL	AX
	JMP	restore

nocgo:
	MOVQ	$0x800000, 0(SP)	// stacksize
	MOVQ	$_rt0_amd64_lib_go(SB), AX
	MOVQ	AX, 8(SP)		// fn
	CALL	runtime·newosproc0(SB)

restore:
	MOVQ	0x10(SP), BX
	MOVQ	0x18(SP), BP
	MOVQ	0x20(SP), R12
	MOVQ	0x28(SP), R13
	MOVQ	0x30(SP), R14
	MOVQ	0x38(SP), R15
	MOVQ	0x40(SP), SP
	RET

// _rt0_amd64_lib_go initializes the Go runtime.
// This is started in a separate thread by _rt0_amd64_lib.
TEXT _rt0_amd64_lib_go(SB),NOSPLIT,$0
	MOVQ	_rt0_amd64_lib_argc<>(SB), DI
	MOVQ	_rt0_amd64_lib_argv<>(SB), SI
	JMP	runtime·rt0_go(SB)

DATA _rt0_amd64_lib_argc<>(SB)/8, $0
GLOBL _rt0_amd64_lib_argc<>(SB),NOPTR, $8
DATA _rt0_amd64_lib_argv<>(SB)/8, $0
GLOBL _rt0_amd64_lib_argv<>(SB),NOPTR, $8

// rt0_go is the common runtime entry point: it sets up g0's stack bounds,
// probes CPU features via CPUID, performs TLS setup, wires up g0<->m0,
// runs the init sequence (args/osinit/schedinit), queues runtime.main as
// the first goroutine, and starts this M. It never returns.
TEXT runtime·rt0_go(SB),NOSPLIT,$0
	// copy arguments forward on an even stack
	MOVQ	DI, AX		// argc
	MOVQ	SI, BX		// argv
	SUBQ	$(4*8+7), SP		// 2args 2auto
	ANDQ	$~15, SP
	MOVQ	AX, 16(SP)
	MOVQ	BX, 24(SP)

	// create istack out of the given (operating system) stack.
	// _cgo_init may update stackguard.
	MOVQ	$runtime·g0(SB), DI
	LEAQ	(-64*1024+104)(SP), BX
	MOVQ	BX, g_stackguard0(DI)
	MOVQ	BX, g_stackguard1(DI)
	MOVQ	BX, (g_stack+stack_lo)(DI)
	MOVQ	SP, (g_stack+stack_hi)(DI)

	// find out information about the processor we're on
	MOVL	$0, AX
	CPUID
	MOVL	AX, SI		// SI = maximum supported CPUID leaf
	CMPL	AX, $0
	JE	nocpuinfo

	// Figure out how to serialize RDTSC.
	// On Intel processors LFENCE is enough. AMD requires MFENCE.
	// Don't know about the rest, so let's do MFENCE.
	// After CPUID leaf 0, BX/DX/CX hold the vendor ID string.
	CMPL	BX, $0x756E6547  // "Genu"
	JNE	notintel
	CMPL	DX, $0x49656E69  // "ineI"
	JNE	notintel
	CMPL	CX, $0x6C65746E  // "ntel"
	JNE	notintel
	MOVB	$1, runtime·isIntel(SB)
	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
notintel:

	// Load EAX=1 cpuid flags
	MOVL	$1, AX
	CPUID
	MOVL	AX, runtime·processorVersionInfo(SB)

	TESTL	$(1<<26), DX // SSE2
	SETNE	runtime·support_sse2(SB)

	TESTL	$(1<<9), CX // SSSE3
	SETNE	runtime·support_ssse3(SB)

	TESTL	$(1<<19), CX // SSE4.1
	SETNE	runtime·support_sse41(SB)

	TESTL	$(1<<20), CX // SSE4.2
	SETNE	runtime·support_sse42(SB)

	TESTL	$(1<<23), CX // POPCNT
	SETNE	runtime·support_popcnt(SB)

	TESTL	$(1<<25), CX // AES
	SETNE	runtime·support_aes(SB)

	TESTL	$(1<<27), CX // OSXSAVE
	SETNE	runtime·support_osxsave(SB)

	// If OS support for XMM and YMM is not present
	// support_avx will be set back to false later.
	TESTL	$(1<<28), CX // AVX
	SETNE	runtime·support_avx(SB)

eax7:
	// Load EAX=7/ECX=0 cpuid flags
	CMPL	SI, $7
	JLT	osavx
	MOVL	$7, AX
	MOVL	$0, CX
	CPUID

	TESTL	$(1<<3), BX // BMI1
	SETNE	runtime·support_bmi1(SB)

	// If OS support for XMM and YMM is not present
	// support_avx2 will be set back to false later.
	TESTL	$(1<<5), BX // AVX2
	SETNE	runtime·support_avx2(SB)

	TESTL	$(1<<8), BX // BMI2
	SETNE	runtime·support_bmi2(SB)

	TESTL	$(1<<9), BX // ERMS
	SETNE	runtime·support_erms(SB)

osavx:
	CMPB	runtime·support_osxsave(SB), $1
	JNE	noavx
	MOVL	$0, CX
	// For XGETBV, OSXSAVE bit is required and sufficient
	XGETBV
	ANDL	$6, AX
	CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
	JE	nocpuinfo
noavx:
	MOVB	$0, runtime·support_avx(SB)
	MOVB	$0, runtime·support_avx2(SB)

nocpuinfo:
	// if there is an _cgo_init, call it.
	MOVQ	_cgo_init(SB), AX
	TESTQ	AX, AX
	JZ	needtls
	// g0 already in DI
	MOVQ	DI, CX	// Win64 uses CX for first parameter
	MOVQ	$setg_gcc<>(SB), SI
	CALL	AX

	// update stackguard after _cgo_init
	MOVQ	$runtime·g0(SB), CX
	MOVQ	(g_stack+stack_lo)(CX), AX
	ADDQ	$const__StackGuard, AX
	MOVQ	AX, g_stackguard0(CX)
	MOVQ	AX, g_stackguard1(CX)

#ifndef GOOS_windows
	JMP ok
#endif
needtls:
#ifdef GOOS_plan9
	// skip TLS setup on Plan 9
	JMP ok
#endif
#ifdef GOOS_solaris
	// skip TLS setup on Solaris
	JMP ok
#endif

	LEAQ	runtime·m0+m_tls(SB), DI
	CALL	runtime·settls(SB)

	// store through it, to make sure it works
	get_tls(BX)
	MOVQ	$0x123, g(BX)
	MOVQ	runtime·m0+m_tls(SB), AX
	CMPQ	AX, $0x123
	JEQ 2(PC)
	MOVL	AX, 0	// abort
ok:
	// set the per-goroutine and per-mach "registers"
	get_tls(BX)
	LEAQ	runtime·g0(SB), CX
	MOVQ	CX, g(BX)
	LEAQ	runtime·m0(SB), AX

	// save m->g0 = g0
	MOVQ	CX, m_g0(AX)
	// save m0 to g0->m
	MOVQ	AX, g_m(CX)

	CLD				// convention is D is always left cleared
	CALL	runtime·check(SB)

	MOVL	16(SP), AX		// copy argc
	MOVL	AX, 0(SP)
	MOVQ	24(SP), AX		// copy argv
	MOVQ	AX, 8(SP)
	CALL	runtime·args(SB)
	CALL	runtime·osinit(SB)
	CALL	runtime·schedinit(SB)

	// create a new goroutine to start program
	MOVQ	$runtime·mainPC(SB), AX		// entry
	PUSHQ	AX
	PUSHQ	$0			// arg size
	CALL	runtime·newproc(SB)
	POPQ	AX
	POPQ	AX

	// start this M
	CALL	runtime·mstart(SB)

	MOVL	$0xf1, 0xf1  // crash; mstart should never return
	RET

// mainPC is a function value for runtime.main, used by rt0_go above.
DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
GLOBL	runtime·mainPC(SB),RODATA,$8

TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
	BYTE	$0xcc	// INT3
	RET

TEXT runtime·asminit(SB),NOSPLIT,$0-0
	// No per-thread init.
	RET

/*
 *  go-routine
 */

// void gosave(Gobuf*)
// save state in Gobuf; setjmp
TEXT runtime·gosave(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), AX		// gobuf
	LEAQ	buf+0(FP), BX		// caller's SP
	MOVQ	BX, gobuf_sp(AX)
	MOVQ	0(SP), BX		// caller's PC
	MOVQ	BX, gobuf_pc(AX)
	MOVQ	$0, gobuf_ret(AX)
	MOVQ	BP, gobuf_bp(AX)
	// Assert ctxt is zero. See func save.
	MOVQ	gobuf_ctxt(AX), BX
	TESTQ	BX, BX
	JZ	2(PC)
	CALL	runtime·badctxt(SB)
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	BX, gobuf_g(AX)
	RET

// void gogo(Gobuf*)
// restore state from Gobuf; longjmp
TEXT runtime·gogo(SB), NOSPLIT, $16-8
	MOVQ	buf+0(FP), BX		// gobuf
	MOVQ	gobuf_g(BX), DX
	MOVQ	0(DX), CX		// make sure g != nil
	get_tls(CX)
	MOVQ	DX, g(CX)
	MOVQ	gobuf_sp(BX), SP	// restore SP
	MOVQ	gobuf_ret(BX), AX
	MOVQ	gobuf_ctxt(BX), DX
	MOVQ	gobuf_bp(BX), BP
	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
	MOVQ	$0, gobuf_ret(BX)
	MOVQ	$0, gobuf_ctxt(BX)
	MOVQ	$0, gobuf_bp(BX)
	MOVQ	gobuf_pc(BX), BX
	JMP	BX

// func mcall(fn func(*g))
// Switch to m->g0's stack, call fn(g).
// Fn must never return. It should gogo(&g->sched)
// to keep running g.
TEXT runtime·mcall(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI

	get_tls(CX)
	MOVQ	g(CX), AX	// save state in g->sched
	MOVQ	0(SP), BX	// caller's PC
	MOVQ	BX, (g_sched+gobuf_pc)(AX)
	LEAQ	fn+0(FP), BX	// caller's SP
	MOVQ	BX, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to m->g0 & its stack, call fn
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	SI, AX	// if g == m->g0 call badmcall
	JNE	3(PC)
	MOVQ	$runtime·badmcall(SB), AX
	JMP	AX
	MOVQ	SI, g(CX)	// g = m->g0
	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	PUSHQ	AX		// push the old g as fn's argument
	MOVQ	DI, DX
	MOVQ	0(DI), DI	// DI = fn's code pointer (DX = closure context)
	CALL	DI
	POPQ	AX
	// fn is expected never to return; if it does, abort via badmcall2.
	MOVQ	$runtime·badmcall2(SB), AX
	JMP	AX
	RET

// systemstack_switch is a dummy routine that systemstack leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the system stack because the one at the top of
// the system stack terminates the stack walk (see topofstack()).
TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
	RET

// func systemstack(fn func())
TEXT runtime·systemstack(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI	// DI = fn
	get_tls(CX)
	MOVQ	g(CX), AX	// AX = g
	MOVQ	g_m(AX), BX	// BX = m

	MOVQ	m_gsignal(BX), DX	// DX = gsignal
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_g0(BX), DX	// DX = g0
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_curg(BX), R8
	CMPQ	AX, R8
	JEQ	switch

	// Bad: g is not gsignal, not g0, not curg. What is it?
	MOVQ	$runtime·badsystemstack(SB), AX
	CALL	AX

switch:
	// save our state in g->sched. Pretend to
	// be systemstack_switch if the G stack is scanned.
	MOVQ	$runtime·systemstack_switch(SB), SI
	MOVQ	SI, (g_sched+gobuf_pc)(AX)
	MOVQ	SP, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to g0
	MOVQ	DX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(DX), BX
	// make it look like mstart called systemstack on g0, to stop traceback
	SUBQ	$8, BX
	MOVQ	$runtime·mstart(SB), DX
	MOVQ	DX, 0(BX)
	MOVQ	BX, SP

	// call target function
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI

	// switch back to g
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), BX
	MOVQ	m_curg(BX), AX
	MOVQ	AX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(AX), SP
	MOVQ	$0, (g_sched+gobuf_sp)(AX)
	RET

noswitch:
	// already on m stack; tail call the function
	// Using a tail call here cleans up tracebacks since we won't stop
	// at an intermediate systemstack.
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	JMP	DI

/*
 * support for morestack
 */

// Called during function prolog when more stack is needed.
//
// The traceback routines see morestack on a g0 as being
// the top of a stack (for example, morestack calling newstack
// calling the scheduler calling newm calling gc), so we must
// record an argument size. For that purpose, it has no arguments.
TEXT runtime·morestack(SB),NOSPLIT,$0-0
	// Cannot grow scheduler stack (m->g0).
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	g(CX), SI
	JNE	3(PC)
	CALL	runtime·badmorestackg0(SB)
	INT	$3

	// Cannot grow signal stack (m->gsignal).
	MOVQ	m_gsignal(BX), SI
	CMPQ	g(CX), SI
	JNE	3(PC)
	CALL	runtime·badmorestackgsignal(SB)
	INT	$3

	// Called from f.
	// Set m->morebuf to f's caller.
	MOVQ	8(SP), AX	// f's caller's PC
	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
	LEAQ	16(SP), AX	// f's caller's SP
	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SI, (m_morebuf+gobuf_g)(BX)

	// Set g->sched to context in f.
	MOVQ	0(SP), AX // f's PC
	MOVQ	AX, (g_sched+gobuf_pc)(SI)
	MOVQ	SI, (g_sched+gobuf_g)(SI)
	LEAQ	8(SP), AX // f's SP
	MOVQ	AX, (g_sched+gobuf_sp)(SI)
	MOVQ	BP, (g_sched+gobuf_bp)(SI)
	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)

	// Call newstack on m->g0's stack.
	MOVQ	m_g0(BX), BX
	MOVQ	BX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(BX), SP
	CALL	runtime·newstack(SB)
	MOVQ	$0, 0x1003	// crash if newstack returns
	RET

// morestack but not preserving ctxt.
TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
	MOVL	$0, DX
	JMP	runtime·morestack(SB)

// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
// of constant-sized-frame functions to encode a few bits of size in the pc.
// Caution: ugly multiline assembly macros in your future!

// DISPATCH jumps to NAME if the requested argsize (in CX) fits in MAXSIZE.
#define DISPATCH(NAME,MAXSIZE)		\
	CMPQ	CX, $MAXSIZE;		\
	JA	3(PC);			\
	MOVQ	$NAME(SB), AX;		\
	JMP	AX
// Note: can't just "JMP NAME(SB)" - bad inlining results.
TEXT reflect·call(SB), NOSPLIT, $0-0
	JMP	·reflectcall(SB)

TEXT ·reflectcall(SB), NOSPLIT, $0-32
	MOVLQZX argsize+24(FP), CX
	DISPATCH(runtime·call32, 32)
	DISPATCH(runtime·call64, 64)
	DISPATCH(runtime·call128, 128)
	DISPATCH(runtime·call256, 256)
	DISPATCH(runtime·call512, 512)
	DISPATCH(runtime·call1024, 1024)
	DISPATCH(runtime·call2048, 2048)
	DISPATCH(runtime·call4096, 4096)
	DISPATCH(runtime·call8192, 8192)
	DISPATCH(runtime·call16384, 16384)
	DISPATCH(runtime·call32768, 32768)
	DISPATCH(runtime·call65536, 65536)
	DISPATCH(runtime·call131072, 131072)
	DISPATCH(runtime·call262144, 262144)
	DISPATCH(runtime·call524288, 524288)
	DISPATCH(runtime·call1048576, 1048576)
	DISPATCH(runtime·call2097152, 2097152)
	DISPATCH(runtime·call4194304, 4194304)
	DISPATCH(runtime·call8388608, 8388608)
	DISPATCH(runtime·call16777216, 16777216)
	DISPATCH(runtime·call33554432, 33554432)
	DISPATCH(runtime·call67108864, 67108864)
	DISPATCH(runtime·call134217728, 134217728)
	DISPATCH(runtime·call268435456, 268435456)
	DISPATCH(runtime·call536870912, 536870912)
	DISPATCH(runtime·call1073741824, 1073741824)
	MOVQ	$runtime·badreflectcall(SB), AX
	JMP	AX

// CALLFN defines one fixed-frame-size trampoline: copy the argument block
// onto the local frame, call the FuncVal, then copy results back via callRet.
#define CALLFN(NAME,MAXSIZE)			\
TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
	NO_LOCAL_POINTERS;			\
	/* copy arguments to stack */		\
	MOVQ	argptr+16(FP), SI;		\
	MOVLQZX argsize+24(FP), CX;		\
	MOVQ	SP, DI;				\
	REP;MOVSB;				\
	/* call function */			\
	MOVQ	f+8(FP), DX;			\
	PCDATA  $PCDATA_StackMapIndex, $0;	\
	CALL	(DX);				\
	/* copy return values back */		\
	MOVQ	argtype+0(FP), DX;		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX	argsize+24(FP), CX;		\
	MOVLQZX retoffset+28(FP), BX;		\
	MOVQ	SP, SI;				\
	ADDQ	BX, DI;				\
	ADDQ	BX, SI;				\
	SUBQ	BX, CX;				\
	CALL	callRet<>(SB);			\
	RET

// callRet copies return values back at the end of call*. This is a
// separate function so it can allocate stack space for the arguments
// to reflectcallmove. It does not follow the Go ABI; it expects its
// arguments in registers (DX=argtype, DI=dst, SI=src, CX=size).
TEXT callRet<>(SB), NOSPLIT, $32-0
	NO_LOCAL_POINTERS
	MOVQ	DX, 0(SP)
	MOVQ	DI, 8(SP)
	MOVQ	SI, 16(SP)
	MOVQ	CX, 24(SP)
	CALL	runtime·reflectcallmove(SB)
	RET

CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
CALLFN(·call256, 256)
CALLFN(·call512, 512)
CALLFN(·call1024, 1024)
CALLFN(·call2048, 2048)
CALLFN(·call4096, 4096)
CALLFN(·call8192, 8192)
CALLFN(·call16384, 16384)
CALLFN(·call32768, 32768)
CALLFN(·call65536, 65536)
CALLFN(·call131072, 131072)
CALLFN(·call262144, 262144)
CALLFN(·call524288, 524288)
CALLFN(·call1048576, 1048576)
CALLFN(·call2097152, 2097152)
CALLFN(·call4194304, 4194304)
CALLFN(·call8388608, 8388608)
CALLFN(·call16777216, 16777216)
CALLFN(·call33554432, 33554432)
CALLFN(·call67108864, 67108864)
CALLFN(·call134217728, 134217728)
CALLFN(·call268435456, 268435456)
CALLFN(·call536870912, 536870912)
CALLFN(·call1073741824, 1073741824)

TEXT runtime·procyield(SB),NOSPLIT,$0-0
	MOVL	cycles+0(FP), AX
again:
	PAUSE
	SUBL	$1, AX
	JNZ	again
	RET


TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
	// Stores are already ordered on x86, so this is just a
	// compile barrier.
	RET

// void jmpdefer(fn, sp);
// called from deferreturn.
// 1. pop the caller
// 2. sub 5 bytes from the callers return
// 3. jmp to the argument
TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
	MOVQ	fv+0(FP), DX	// fn
	MOVQ	argp+8(FP), BX	// caller sp
	LEAQ	-8(BX), SP	// caller sp after CALL
	MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
	SUBQ	$5, (SP)	// return to CALL again (5 = size of the CALL instruction)
	MOVQ	0(DX), BX
	JMP	BX	// but first run the deferred function

// Save state of caller into g->sched. Smashes R8, R9.
TEXT gosave<>(SB),NOSPLIT,$0
	get_tls(R8)
	MOVQ	g(R8), R8
	MOVQ	0(SP), R9
	MOVQ	R9, (g_sched+gobuf_pc)(R8)
	LEAQ	8(SP), R9
	MOVQ	R9, (g_sched+gobuf_sp)(R8)
	MOVQ	$0, (g_sched+gobuf_ret)(R8)
	MOVQ	BP, (g_sched+gobuf_bp)(R8)
	// Assert ctxt is zero. See func save.
	MOVQ	(g_sched+gobuf_ctxt)(R8), R9
	TESTQ	R9, R9
	JZ	2(PC)
	CALL	runtime·badctxt(SB)
	RET

// func asmcgocall(fn, arg unsafe.Pointer) int32
// Call fn(arg) on the scheduler stack,
// aligned appropriately for the gcc ABI.
// See cgocall.go for more details.
TEXT ·asmcgocall(SB),NOSPLIT,$0-20
	MOVQ	fn+0(FP), AX
	MOVQ	arg+8(FP), BX

	MOVQ	SP, DX	// DX = SP on entry, used to compute stack depth

	// Figure out if we need to switch to m->g0 stack.
	// We get called to create new OS threads too, and those
	// come in on the m->g0 stack already.
	get_tls(CX)
	MOVQ	g(CX), R8
	CMPQ	R8, $0
	JEQ	nosave
	MOVQ	g_m(R8), R8
	MOVQ	m_g0(R8), SI
	MOVQ	g(CX), DI
	CMPQ	SI, DI
	JEQ	nosave
	MOVQ	m_gsignal(R8), SI
	CMPQ	SI, DI
	JEQ	nosave

	// Switch to system stack.
	MOVQ	m_g0(R8), SI
	CALL	gosave<>(SB)
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP

	// Now on a scheduling stack (a pthread-created stack).
	// Make sure we have enough room for 4 stack-backed fast-call
	// registers as per windows amd64 calling convention.
	SUBQ	$64, SP
	ANDQ	$~15, SP	// alignment for gcc ABI
	MOVQ	DI, 48(SP)	// save g
	MOVQ	(g_stack+stack_hi)(DI), DI
	SUBQ	DX, DI
	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
	MOVQ	BX, CX		// CX = first argument in Win64
	CALL	AX

	// Restore registers, g, stack pointer.
	get_tls(CX)
	MOVQ	48(SP), DI
	MOVQ	(g_stack+stack_hi)(DI), SI
	SUBQ	40(SP), SI
	MOVQ	DI, g(CX)
	MOVQ	SI, SP

	MOVL	AX, ret+16(FP)
	RET

nosave:
	// Running on a system stack, perhaps even without a g.
	// Having no g can happen during thread creation or thread teardown
	// (see needm/dropm on Solaris, for example).
	// This code is like the above sequence but without saving/restoring g
	// and without worrying about the stack moving out from under us
	// (because we're on a system stack, not a goroutine stack).
	// The above code could be used directly if already on a system stack,
	// but then the only path through this code would be a rare case on Solaris.
	// Using this code for all "already on system stack" calls exercises it more,
	// which should help keep it correct.
	SUBQ	$64, SP
	ANDQ	$~15, SP
	MOVQ	$0, 48(SP)	// where above code stores g, in case someone looks during debugging
	MOVQ	DX, 40(SP)	// save original stack pointer
	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
	MOVQ	BX, CX		// CX = first argument in Win64
	CALL	AX
	MOVQ	40(SP), SI	// restore original stack pointer
	MOVQ	SI, SP
	MOVL	AX, ret+16(FP)
	RET

// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
// Turn the fn into a Go func (by taking its address) and call
// cgocallback_gofunc.
TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
	LEAQ	fn+0(FP), AX
	MOVQ	AX, 0(SP)
	MOVQ	frame+8(FP), AX
	MOVQ	AX, 8(SP)
	MOVQ	framesize+16(FP), AX
	MOVQ	AX, 16(SP)
	MOVQ	ctxt+24(FP), AX
	MOVQ	AX, 24(SP)
	MOVQ	$runtime·cgocallback_gofunc(SB), AX
	CALL	AX
	RET

// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
// See cgocall.go for more details.
TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
	NO_LOCAL_POINTERS

	// If g is nil, Go did not create the current thread.
	// Call needm to obtain one m for temporary use.
	// In this case, we're running on the thread stack, so there's
	// lots of space, but the linker doesn't know. Hide the call from
	// the linker analysis by using an indirect call through AX.
	get_tls(CX)
#ifdef GOOS_windows
	MOVL	$0, BX
	CMPQ	CX, $0
	JEQ	2(PC)
#endif
	MOVQ	g(CX), BX
	CMPQ	BX, $0
	JEQ	needm
	MOVQ	g_m(BX), BX
	MOVQ	BX, R8	// holds oldm until end of function
	JMP	havem
needm:
	MOVQ	$0, 0(SP)
	MOVQ	$runtime·needm(SB), AX
	CALL	AX
	MOVQ	0(SP), R8
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX

	// Set m->sched.sp = SP, so that if a panic happens
	// during the function we are about to execute, it will
	// have a valid SP to run on the g0 stack.
	// The next few lines (after the havem label)
	// will save this SP onto the stack and then write
	// the same SP back to m->sched.sp. That seems redundant,
	// but if an unrecovered panic happens, unwindm will
	// restore the g->sched.sp from the stack location
	// and then systemstack will try to use it. If we don't set it here,
	// that restored SP will be uninitialized (typically 0) and
	// will not be usable.
	MOVQ	m_g0(BX), SI
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

havem:
	// Now there's a valid m, and we're running on its m->g0.
	// Save current m->g0->sched.sp on stack and then set it to SP.
	// Save current sp in m->g0->sched.sp in preparation for
	// switch back to m->curg stack.
	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
	MOVQ	m_g0(BX), SI
	MOVQ	(g_sched+gobuf_sp)(SI), AX
	MOVQ	AX, 0(SP)
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

	// Switch to m->curg stack and call runtime.cgocallbackg.
	// Because we are taking over the execution of m->curg
	// but *not* resuming what had been running, we need to
	// save that information (m->curg->sched) so we can restore it.
	// We can restore m->curg->sched.sp easily, because calling
	// runtime.cgocallbackg leaves SP unchanged upon return.
	// To save m->curg->sched.pc, we push it onto the stack.
	// This has the added benefit that it looks to the traceback
	// routine like cgocallbackg is going to return to that
	// PC (because the frame we allocate below has the same
	// size as cgocallback_gofunc's frame declared above)
	// so that the traceback will seamlessly trace back into
	// the earlier calls.
	//
	// In the new goroutine, 8(SP) holds the saved R8.
	MOVQ	m_curg(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
	MOVQ	(g_sched+gobuf_pc)(SI), BX
	MOVQ	BX, -8(DI)
	// Compute the size of the frame, including return PC and, if
	// GOEXPERIMENT=framepointer, the saved base pointer
	MOVQ	ctxt+24(FP), BX
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX		// AX = frame size (distance from SP to first arg)
	SUBQ	AX, DI
	MOVQ	DI, SP

	MOVQ	R8, 8(SP)
	MOVQ	BX, 0(SP)	// ctxt is cgocallbackg's argument
	CALL	runtime·cgocallbackg(SB)
	MOVQ	8(SP), R8

	// Compute the size of the frame again. FP and SP have
	// completely different values here than they did above,
	// but only their difference matters.
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX

	// Restore g->sched (== m->curg->sched) from saved values.
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SP, DI
	ADDQ	AX, DI
	MOVQ	-8(DI), BX
	MOVQ	BX, (g_sched+gobuf_pc)(SI)
	MOVQ	DI, (g_sched+gobuf_sp)(SI)

	// Switch back to m->g0's stack and restore m->g0->sched.sp.
	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
	// so we do not have to restore it.)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP
	MOVQ	0(SP), AX
	MOVQ	AX, (g_sched+gobuf_sp)(SI)

	// If the m on entry was nil, we called needm above to borrow an m
	// for the duration of the call. Since the call is over, return it with dropm.
	CMPQ	R8, $0
	JNE 3(PC)
	MOVQ	$runtime·dropm(SB), AX
	CALL	AX

	// Done!
	RET

// void setg(G*); set g. for use by needm.
TEXT runtime·setg(SB), NOSPLIT, $0-8
	MOVQ	gg+0(FP), BX
#ifdef GOOS_windows
	CMPQ	BX, $0
	JNE	settls
	MOVQ	$0, 0x28(GS)
	RET
settls:
	MOVQ	g_m(BX), AX
	LEAQ	m_tls(AX), AX
	MOVQ	AX, 0x28(GS)
#endif
	get_tls(CX)
	MOVQ	BX, g(CX)
	RET

// void setg_gcc(G*); set g called from gcc.
TEXT setg_gcc<>(SB),NOSPLIT,$0
	get_tls(AX)
	MOVQ	DI, g(AX)	// g passed in DI per the C ABI
	RET

// check that SP is in range [g->stack.lo, g->stack.hi)
TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
	get_tls(CX)
	MOVQ	g(CX), AX
	CMPQ	(g_stack+stack_hi)(AX), SP
	JHI	2(PC)
	INT	$3
	CMPQ	SP, (g_stack+stack_lo)(AX)
	JHI	2(PC)
	INT	$3
	RET

// func cputicks() int64
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
	JNE	mfence
	LFENCE
	JMP	done
mfence:
	MFENCE
done:
	RDTSC
	SHLQ	$32, DX
	ADDQ	DX, AX		// AX = (EDX << 32) | EAX, the full 64-bit TSC
	MOVQ	AX, ret+0(FP)
	RET

// hash function using AES hardware instructions
TEXT runtime·aeshash(SB),NOSPLIT,$0-32
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	s+16(FP), CX	// size
	LEAQ	ret+24(FP), DX
	JMP	runtime·aeshashbody(SB)

TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to string struct
	MOVQ	8(AX), CX	// length of string
	MOVQ	(AX), AX	// string data
	LEAQ	ret+16(FP), DX
	JMP	runtime·aeshashbody(SB)

// AX: data
// CX: length
// DX: address to put return value
TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
	// Fill an SSE register with our seeds.
	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
	PINSRW	$4, CX, X0			// 16 bits of length
	PSHUFHW $0, X0, X0			// repeat length 4 times total
	MOVO	X0, X1				// save unscrambled seed
	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
	AESENC	X0, X0				// scramble seed

	CMPQ	CX, $16
	JB	aes0to15
	JE	aes16
	CMPQ	CX, $32
	JBE	aes17to32
	CMPQ	CX, $64
	JBE	aes33to64
	CMPQ	CX, $128
	JBE	aes65to128
	JMP	aes129plus

aes0to15:
	TESTQ	CX, CX
	JE	aes0

	ADDQ	$16, AX
	TESTW	$0xff0, AX
	JE	endofpage

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X1
	ADDQ	CX, CX
	MOVQ	$masks<>(SB), AX
	PAND	(AX)(CX*8), X1		// mask off bytes beyond the data
final1:
	PXOR	X0, X1	// xor data with seed
	AESENC	X1, X1	// scramble combo 3 times
	AESENC	X1, X1
	AESENC	X1, X1
	MOVQ	X1, (DX)
	RET

endofpage:
	// address ends in 1111xxxx. Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	MOVOU	-32(AX)(CX*1), X1
	ADDQ	CX, CX
	MOVQ	$shifts<>(SB), AX
	PSHUFB	(AX)(CX*8), X1
	JMP	final1

aes0:
	// Return scrambled input seed
	AESENC	X0, X0
	MOVQ	X0, (DX)
	RET

aes16:
	MOVOU	(AX), X1
	JMP	final1

aes17to32:
	// make second starting seed
	PXOR	runtime·aeskeysched+16(SB), X1
	AESENC	X1, X1

	// load data to be hashed
	MOVOU	(AX), X2
	MOVOU	-16(AX)(CX*1), X3	// second load may overlap the first

	// xor with seed
	PXOR	X0, X2
	PXOR	X1, X3

	// scramble 3 times
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3

	// combine results
	PXOR	X3, X2
	MOVQ	X2, (DX)
	RET

aes33to64:
	// make 3 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3

	MOVOU	(AX), X4
	MOVOU	16(AX), X5
	MOVOU	-32(AX)(CX*1), X6
	MOVOU	-16(AX)(CX*1), X7

	PXOR	X0, X4
	PXOR	X1, X5
	PXOR	X2, X6
	PXOR	X3, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	PXOR	X6, X4
	PXOR	X7, X5
	PXOR	X5, X4
	MOVQ	X4, (DX)
	RET

aes65to128:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// load data
	MOVOU	(AX), X8
	MOVOU	16(AX), X9
	MOVOU	32(AX), X10
	MOVOU	48(AX), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor with seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// scramble 3 times
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

aes129plus:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtime·aeskeysched+16(SB), X1
	PXOR	runtime·aeskeysched+32(SB), X2
	PXOR	runtime·aeskeysched+48(SB), X3
	PXOR	runtime·aeskeysched+64(SB), X4
	PXOR	runtime·aeskeysched+80(SB), X5
	PXOR	runtime·aeskeysched+96(SB), X6
	PXOR	runtime·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// start with last (possibly overlapping) block
	MOVOU	-128(AX)(CX*1), X8
	MOVOU	-112(AX)(CX*1), X9
	MOVOU	-96(AX)(CX*1), X10
	MOVOU	-80(AX)(CX*1), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor in seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// compute number of remaining 128-byte blocks
	DECQ	CX
	SHRQ	$7, CX

aesloop:
	// scramble state
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// scramble state, xor in a block
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	MOVOU	64(AX), X4
	MOVOU	80(AX), X5
	MOVOU	96(AX), X6
	MOVOU	112(AX), X7
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	ADDQ	$128, AX
	DECQ	CX
	JNE	aesloop

	// 3 more scrambles to finish
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

// simple mask to get rid of data in the high part of the register.
// NOTE(review): this table continues past the end of this chunk
// (further entries, its GLOBL, and the shifts<> table are not visible here).
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256

// checkASM reports whether the assembly-level invariants hold.
// Currently: masks<> and shifts<> must both be 16-byte aligned
// (they are used as 16-byte SSE operands).
TEXT ·checkASM(SB),NOSPLIT,$0-1
	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
	MOVQ	$masks<>(SB), AX
	MOVQ	$shifts<>(SB), BX
	ORQ	BX, AX
	TESTQ	$15, AX
	SETEQ	ret+0(FP)
	RET

// these are arguments to pshufb. They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
DATA shifts<>+0x00(SB)/8, $0x0000000000000000
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256

// memequal(p, q unsafe.Pointer, size uintptr) bool
// Fast path: identical pointers compare equal without reading memory.
TEXT runtime·memequal(SB),NOSPLIT,$0-25
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	size+16(FP), BX
	LEAQ	ret+24(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+24(FP)
	RET

// memequal_varlen(a, b unsafe.Pointer) bool
// Closure-called variant: the size is not an argument but is read
// from the calling closure (DX is the closure context register).
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	8(DX), BX	// compiler stores size at offset 8 in the closure
	LEAQ	ret+16(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+16(FP)
	RET

// memeqbody compares BX bytes at SI and DI and stores a bool at (AX).
// a in SI
// b in DI
// count in BX
// address of result byte in AX
TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small
	CMPQ	BX, $64
	JB	bigloop
	CMPB	runtime·support_avx2(SB), $1
	JE	hugeloop_avx2

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	// AND the four per-byte equality masks together; all 64 bytes are
	// equal iff every lane of X0 is 0xff, i.e. PMOVMSKB gives 0xffff.
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff
	JEQ	hugeloop
	MOVB	$0, (AX)
	RET

	// 64 bytes at a time using ymm registers
hugeloop_avx2:
	CMPQ	BX, $64
	JB	bigloop_avx2
	VMOVDQU	(SI), Y0
	VMOVDQU	(DI), Y1
	VMOVDQU	32(SI), Y2
	VMOVDQU	32(DI), Y3
	VPCMPEQB	Y1, Y0, Y4
	VPCMPEQB	Y2, Y3, Y5
	VPAND	Y4, Y5, Y6
	VPMOVMSKB	Y6, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffffffff
	JEQ	hugeloop_avx2
	VZEROUPPER	// leave AVX state before returning to SSE/scalar code
	MOVB	$0, (AX)
	RET

bigloop_avx2:
	VZEROUPPER

	// 8 bytes at a time using 64-bit register
bigloop:
	CMPQ	BX, $8
	JBE	leftover
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	bigloop
	MOVB	$0, (AX)
	RET

	// remaining 0-8 bytes
	// (overlapping 8-byte loads from the end; BX >= 8 originally,
	// so -8(SI)(BX*1) never reads before the buffer)
leftover:
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	(AX)
	RET

small:
	CMPQ	BX, $0
	JEQ	equal

	LEAQ	0(BX*8), CX	// CX = number of bits to compare
	NEGQ	CX	// CX = 64 - bits (mod 64), shift count to discard extra bytes

	CMPB	SI, $0xf8
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
di_finish:

	// Compare only the low BX bytes: subtract and shift out the
	// bytes beyond the requested length, then test for zero.
	SUBQ	SI, DI
	SHLQ	CX, DI
equal:
	SETEQ	(AX)
	RET

// cmpstring(s1, s2 string) int — tail-calls cmpbody with the
// register contract documented below.
TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
	MOVQ	s1_base+0(FP), SI
	MOVQ	s1_len+8(FP), BX
	MOVQ	s2_base+16(FP), DI
	MOVQ	s2_len+24(FP), DX
	LEAQ	ret+32(FP), R9
	JMP	runtime·cmpbody(SB)

// bytes.Compare(a, b []byte) int — linknamed here; same tail call.
TEXT bytes·Compare(SB),NOSPLIT,$0-56
	MOVQ	s1+0(FP), SI
	MOVQ	s1+8(FP), BX
	MOVQ	s2+24(FP), DI
	MOVQ	s2+32(FP), DX
	LEAQ	res+48(FP), R9
	JMP	runtime·cmpbody(SB)

// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8	// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	CMPB	runtime·support_avx2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// Entry points from big_loop: advance to the 16-byte chunk
	// that contained the difference, then fall into diff16.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX	// unsigned byte compare: 1 if a's byte > b's byte
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// Overlapping 8-byte loads from the end of the common prefix.
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX	// - bits left (== 64 - bits left mod 64)
	JEQ	allsame

	// load bytes of a into high bytes of SI
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// a's address ends in 11111xxx: load from the end to avoid
	// crossing a page boundary, then shift into place.
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI

	// load bytes of b in to high bytes of DI
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// The common prefix is identical; order by length.
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET

	// this works for >= 64 bytes of data.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB	Y2, Y3, Y0
	VPMOVMSKB	Y0, AX
	XORL	$0xffffffff, AX
	JNE	diff32_avx2
	VPCMPEQB	Y4, Y5, Y6
	VPMOVMSKB	Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP	diff16

	// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP	diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP	loop

// strings.Index fast path for short separators; linknamed from strings.
TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
	MOVQ	s+0(FP), DI
	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
	MOVQ	s_len+8(FP), DX
	MOVQ	c+16(FP), BP
	MOVQ	c_len+24(FP), AX
	MOVQ	DI, R10
	LEAQ	ret+32(FP), R11
	JMP	runtime·indexShortStr(SB)

// bytes.Index fast path for short separators; linknamed from bytes.
TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
	MOVQ	s+0(FP), DI
	MOVQ	s_len+8(FP), DX
	MOVQ	c+24(FP), BP
	MOVQ	c_len+32(FP), AX
	MOVQ	DI, R10
	LEAQ	ret+48(FP), R11
	JMP	runtime·indexShortStr(SB)

// indexShortStr: substring search, dispatching on needle length.
// AX: length of string, that we are searching for
// DX: length of string, in which we are searching
// DI: pointer to string, in which we are searching
// BP: pointer to string, that we are searching for
// R10: copy of original DI (start of haystack), used to compute the index
// R11: address, where to put return value (-1 if not found)
TEXT runtime·indexShortStr(SB),NOSPLIT,$0
	CMPQ	AX, DX
	JA	fail
	CMPQ	DX, $16
	JAE	sse42
no_sse42:
	CMPQ	AX, $2
	JA	_3_or_more
	// 2-byte needle: compare 16-bit loads at every offset.
	MOVW	(BP), BP
	LEAQ	-1(DI)(DX*1), DX	// DX = one past the last valid start position
loop2:
	MOVW	(DI), SI
	CMPW	SI,BP
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop2
	JMP	fail
_3_or_more:
	CMPQ	AX, $3
	JA	_4_or_more
	// 3-byte needle: match first 2 bytes, then bytes 1..2 (overlapping).
	MOVW	1(BP), BX
	MOVW	(BP), BP
	LEAQ	-2(DI)(DX*1), DX
loop3:
	MOVW	(DI), SI
	CMPW	SI,BP
	JZ	partial_success3
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop3
	JMP	fail
partial_success3:
	MOVW	1(DI), SI
	CMPW	SI,BX
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop3
	JMP	fail
_4_or_more:
	CMPQ	AX, $4
	JA	_5_or_more
	// 4-byte needle: single 32-bit compare per position.
	MOVL	(BP), BP
	LEAQ	-3(DI)(DX*1), DX
loop4:
	MOVL	(DI), SI
	CMPL	SI,BP
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop4
	JMP	fail
_5_or_more:
	CMPQ	AX, $7
	JA	_8_or_more
	// 5..7 bytes: match first 4 bytes and last 4 bytes (overlapping).
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX
	MOVL	-4(BP)(AX*1), BX
	MOVL	(BP), BP
loop5to7:
	MOVL	(DI), SI
	CMPL	SI,BP
	JZ	partial_success5to7
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop5to7
	JMP	fail
partial_success5to7:
	MOVL	-4(AX)(DI*1), SI
	CMPL	SI,BX
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop5to7
	JMP	fail
_8_or_more:
	CMPQ	AX, $8
	JA	_9_or_more
	// 8-byte needle: single 64-bit compare per position.
	MOVQ	(BP), BP
	LEAQ	-7(DI)(DX*1), DX
loop8:
	MOVQ	(DI), SI
	CMPQ	SI,BP
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop8
	JMP	fail
_9_or_more:
	CMPQ	AX, $15
	JA	_16_or_more
	// 9..15 bytes: match first 8 and last 8 bytes (overlapping).
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX
	MOVQ	-8(BP)(AX*1), BX
	MOVQ	(BP), BP
loop9to15:
	MOVQ	(DI), SI
	CMPQ	SI,BP
	JZ	partial_success9to15
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop9to15
	JMP	fail
partial_success9to15:
	MOVQ	-8(AX)(DI*1), SI
	CMPQ	SI,BX
	JZ	success
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop9to15
	JMP	fail
_16_or_more:
	CMPQ	AX, $16
	JA	_17_or_more
	// 16-byte needle: one SSE compare per position.
	MOVOU	(BP), X1
	LEAQ	-15(DI)(DX*1), DX
loop16:
	MOVOU	(DI), X2
	PCMPEQB	X1, X2
	PMOVMSKB	X2, SI
	CMPQ	SI, $0xffff
	JE	success
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop16
	JMP	fail
_17_or_more:
	CMPQ	AX, $31
	JA	_32_or_more
	// 17..31 bytes: match first 16 and last 16 bytes (overlapping).
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX
	MOVOU	-16(BP)(AX*1), X0
	MOVOU	(BP), X1
loop17to31:
	MOVOU	(DI), X2
	PCMPEQB	X1,X2
	PMOVMSKB	X2, SI
	CMPQ	SI, $0xffff
	JE	partial_success17to31
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop17to31
	JMP	fail
partial_success17to31:
	MOVOU	-16(AX)(DI*1), X3
	PCMPEQB	X0, X3
	PMOVMSKB	X3, SI
	CMPQ	SI, $0xffff
	JE	success
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop17to31
	JMP	fail
// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
// So no need to check cpuid
_32_or_more:
	CMPQ	AX, $32
	JA	_33_to_63
	// 32-byte needle: one AVX2 compare per position.
	VMOVDQU	(BP), Y1
	LEAQ	-31(DI)(DX*1), DX
loop32:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, SI
	CMPL	SI, $0xffffffff
	JE	success_avx2
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop32
	JMP	fail_avx2
_33_to_63:
	// 33..63 bytes: match first 32 and last 32 bytes (overlapping).
	LEAQ	1(DI)(DX*1), DX
	SUBQ	AX, DX
	VMOVDQU	-32(BP)(AX*1), Y0
	VMOVDQU	(BP), Y1
loop33to63:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, SI
	CMPL	SI, $0xffffffff
	JE	partial_success33to63
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop33to63
	JMP	fail_avx2
partial_success33to63:
	VMOVDQU	-32(AX)(DI*1), Y3
	VPCMPEQB	Y0, Y3, Y4
	VPMOVMSKB	Y4, SI
	CMPL	SI, $0xffffffff
	JE	success_avx2
	ADDQ	$1,DI
	CMPQ	DI,DX
	JB	loop33to63
fail_avx2:
	VZEROUPPER
fail:
	MOVQ	$-1, (R11)
	RET
success_avx2:
	VZEROUPPER
	JMP	success
sse42:
	CMPB	runtime·support_sse42(SB), $1
	JNE	no_sse42
	CMPQ	AX, $12
	// PCMPESTRI is slower than normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
	JAE	_9_or_more
	LEAQ	16(BP), SI
	// Bail out to scalar code if loading 16 bytes at BP would cross
	// a page boundary (address ends in 0xff0..0xfff).
	TESTW	$0xff0, SI
	JEQ	no_sse42
	MOVOU	(BP), X1
	LEAQ	-15(DI)(DX*1), SI
	MOVQ	$16, R9
	SUBQ	AX, R9	// We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
	// for equality (bits 2,3 are 11)
	// result is not masked or inverted (bits 4,5 are 00)
	// and corresponds to first matching byte (bit 6 is 0)
	PCMPESTRI	$0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means partial match at the end of the string,
	// otherwise sep is at offset CX from X1 start
	CMPQ	CX, R9
	JBE	sse42_success
	ADDQ	R9, DI
	CMPQ	DI, SI
	JB	loop_sse42
	// Check the final (possibly overlapping) window ending at SI-1.
	PCMPESTRI	$0x0c, -1(SI), X1
	CMPQ	CX, R9
	JA	fail
	LEAQ	-1(SI), DI
sse42_success:
	ADDQ	CX, DI
success:
	SUBQ	R10, DI	// convert pointer to index from start of haystack
	MOVQ	DI, (R11)
	RET


// bytes.IndexByte(s []byte, c byte) int — linknamed from bytes.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
	MOVQ	s+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+24(FP), AL
	LEAQ	ret+32(FP), R8
	JMP	runtime·indexbytebody(SB)

// strings.IndexByte(s string, c byte) int — linknamed from strings.
TEXT strings·IndexByte(SB),NOSPLIT,$0-32
	MOVQ	s+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+16(FP), AL
	LEAQ	ret+24(FP), R8
	JMP	runtime·indexbytebody(SB)

// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL	$0, X0, X0

	CMPQ	BX, $16
	JLT	small

	MOVQ	SI, DI

	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Find first set bit, if any.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ	$-1, (R8)
	RET

	// We've found a chunk containing the byte.
	// The chunk was loaded from DI.
	// The index of the matching byte in the chunk is DX.
	// The start of the data is SI.
ssesuccess:
	SUBQ	SI, DI	// Compute offset of chunk within data.
	ADDQ	DX, DI	// Add offset of byte within chunk.
	MOVQ	DI, (R8)
	RET

	// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure

	// Check if we'll load across a page boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1	// Load data
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	CMPL	DX, BX
	JAE	failure	// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX	// Shift desired bits down to bottom of register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	CMPB	runtime·support_avx2(SB), $1
	JNE	sse
	// Broadcast the sought byte into all 32 lanes of Y1.
	MOVD	AX, X0
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1
avx2_loop:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3
	JNZ	avx2success
	ADDQ	$32, DI
	CMPQ	DI, R11
	JLT	avx2_loop
	// Final (possibly overlapping) 32-byte chunk.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3
	JNZ	avx2success
	VZEROUPPER
	MOVQ	$-1, (R8)
	RET

avx2success:
	VPMOVMSKB	Y3, DX
	BSFL	DX, DX	// index of match within the chunk
	SUBQ	SI, DI
	ADDQ	DI, DX
	MOVQ	DX, (R8)
	VZEROUPPER
	RET

// bytes.Equal(a, b []byte) bool — linknamed from bytes.
// Lengths must match before comparing contents via memeqbody.
TEXT bytes·Equal(SB),NOSPLIT,$0-49
	MOVQ	a_len+8(FP), BX
	MOVQ	b_len+32(FP), CX
	CMPQ	BX, CX
	JNE	eqret
	MOVQ	a+0(FP), SI
	MOVQ	b+24(FP), DI
	LEAQ	ret+48(FP), AX
	JMP	runtime·memeqbody(SB)
eqret:
	MOVB	$0, ret+48(FP)
	RET


// bytes.Count single-byte fast path — linknamed from bytes.
TEXT bytes·countByte(SB),NOSPLIT,$0-40
	MOVQ	s+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+24(FP), AL
	LEAQ	ret+32(FP), R8
	JMP	runtime·countByte(SB)

// strings.Count single-byte fast path — linknamed from strings.
TEXT strings·countByte(SB),NOSPLIT,$0-32
	MOVQ	s+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+16(FP), AL
	LEAQ	ret+24(FP), R8
	JMP	runtime·countByte(SB)

// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// This requires the POPCNT instruction
TEXT runtime·countByte(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL	$0, X0, X0

	CMPQ	BX, $16
	JLT	small

	MOVQ	$0, R12	// Accumulator

	MOVQ	SI, DI

	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Count number of matching bytes
	POPCNTL DX, DX
	// Accumulate into R12
	ADDQ	DX, R12
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JBE	sseloop

	// Get the number of bytes to consider in the last 16 bytes
	ANDQ	$15, BX
	JZ	end

	// Create mask to ignore overlap between previous 16 byte block
	// and the next.
	MOVQ	$16,CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10

	// Process the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched so we need to mask part of it.
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ	R10, DX
	POPCNTL DX, DX
	ADDQ	DX, R12
end:
	MOVQ	R12, (R8)
	RET

	// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	endzero

	// Check if we'll load across a page boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	// We must ignore high bytes as they aren't part of our slice.
	// Create mask.
	MOVB	BX, CX
	MOVQ	$1, R10
	SALQ	CL, R10
	SUBQ	$1, R10

	// Load data
	MOVOU	(SI), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ	R10, DX
	POPCNTL DX, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	MOVQ	DX, (R8)
	RET
endzero:
	MOVQ	$0, (R8)
	RET

endofpage:
	// We must ignore low bytes as they aren't part of our slice.
	MOVQ	$16,CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10

	// Load data into the high end of X1.
	MOVOU	-16(SI)(BX*1), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ	R10, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	POPCNTL DX, DX
	MOVQ	DX, (R8)
	RET

avx2:
	CMPB	runtime·support_avx2(SB), $1
	JNE	sse
	// Broadcast the sought byte into all 32 lanes of Y1.
	MOVD	AX, X0
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1
avx2_loop:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
	ADDQ	$32, DI
	CMPQ	DI, R11
	JLE	avx2_loop

	// If last block is already processed,
	// skip to the end.
	CMPQ	DI, R11
	JEQ	endavx

	// Load address of the last 32 bytes.
	// There is an overlap with the previous block.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, DX
	// Exit AVX mode.
	VZEROUPPER

	// Create mask to ignore overlap between previous 32 byte block
	// and the next.
	ANDQ	$31, BX
	MOVQ	$32,CX
	SUBQ	BX, CX
	MOVQ	$0xFFFFFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
	MOVQ	R12, (R8)
	RET
endavx:
	// Exit AVX mode.
	VZEROUPPER
	MOVQ	R12, (R8)
	RET

// return0 returns 0 in AX; used by the runtime as a trivial return stub.
TEXT runtime·return0(SB), NOSPLIT, $0
	MOVL	$0, AX
	RET


// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
// Must obey the gcc calling convention.
TEXT _cgo_topofstack(SB),NOSPLIT,$0
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), AX
	MOVQ	m_curg(AX), AX
	MOVQ	(g_stack+stack_hi)(AX), AX
	RET

// The top-most function running on a goroutine
// returns to goexit+PCQuantum.
TEXT runtime·goexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP
	CALL	runtime·goexit1(SB)	// does not return
	// traceback from goexit1 must hit code range of goexit
	BYTE	$0x90	// NOP

// This is called from .init_array and follows the platform, not Go, ABI.
// DI holds the moduledata pointer (first C argument); appends it to the
// runtime's linked list of module data.
TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
	PUSHQ	R15	// The access to global variables below implicitly uses R15, which is callee-save
	MOVQ	runtime·lastmoduledatap(SB), AX
	MOVQ	DI, moduledata_next(AX)
	MOVQ	DI, runtime·lastmoduledatap(SB)
	POPQ	R15
	RET

// gcWriteBarrier performs a heap pointer write and informs the GC.
//
// gcWriteBarrier does NOT follow the Go ABI. It takes two arguments:
// - DI is the destination of the write
// - AX is the value being written at DI
// It clobbers FLAGS. It does not clobber any general-purpose registers,
// but may clobber others (e.g., SSE registers).
TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$120
	// Save the registers clobbered by the fast path. This is slightly
	// faster than having the caller spill these.
	MOVQ	R14, 104(SP)
	MOVQ	R13, 112(SP)
	// TODO: Consider passing g.m.p in as an argument so they can be shared
	// across a sequence of write barriers.
	get_tls(R13)
	MOVQ	g(R13), R13
	MOVQ	g_m(R13), R13
	MOVQ	m_p(R13), R13
	MOVQ	(p_wbBuf+wbBuf_next)(R13), R14
	// Increment wbBuf.next position.
	LEAQ	16(R14), R14
	MOVQ	R14, (p_wbBuf+wbBuf_next)(R13)
	CMPQ	R14, (p_wbBuf+wbBuf_end)(R13)
	// Record the write.
	MOVQ	AX, -16(R14)	// Record value
	MOVQ	(DI), R13	// TODO: This turns bad writes into bad reads.
	MOVQ	R13, -8(R14)	// Record *slot
	// Is the buffer full? (flags set in CMPQ above)
	JEQ	flush
ret:
	MOVQ	104(SP), R14
	MOVQ	112(SP), R13
	// Do the write.
	MOVQ	AX, (DI)
	RET

flush:
	// Save all general purpose registers since these could be
	// clobbered by wbBufFlush and were not saved by the caller.
	// It is possible for wbBufFlush to clobber other registers
	// (e.g., SSE registers), but the compiler takes care of saving
	// those in the caller if necessary. This strikes a balance
	// with registers that are likely to be used.
	//
	// We don't have type information for these, but all code under
	// here is NOSPLIT, so nothing will observe these.
	//
	// TODO: We could strike a different balance; e.g., saving X0
	// and not saving GP registers that are less likely to be used.
	MOVQ	DI, 0(SP)	// Also first argument to wbBufFlush
	MOVQ	AX, 8(SP)	// Also second argument to wbBufFlush
	MOVQ	BX, 16(SP)
	MOVQ	CX, 24(SP)
	MOVQ	DX, 32(SP)
	// DI already saved
	MOVQ	SI, 40(SP)
	MOVQ	BP, 48(SP)
	MOVQ	R8, 56(SP)
	MOVQ	R9, 64(SP)
	MOVQ	R10, 72(SP)
	MOVQ	R11, 80(SP)
	MOVQ	R12, 88(SP)
	// R13 already saved
	// R14 already saved
	MOVQ	R15, 96(SP)

	// This takes arguments DI and AX
	CALL	runtime·wbBufFlush(SB)

	MOVQ	0(SP), DI
	MOVQ	8(SP), AX
	MOVQ	16(SP), BX
	MOVQ	24(SP), CX
	MOVQ	32(SP), DX
	MOVQ	40(SP), SI
	MOVQ	48(SP), BP
	MOVQ	56(SP), R8
	MOVQ	64(SP), R9
	MOVQ	72(SP), R10
	MOVQ	80(SP), R11
	MOVQ	88(SP), R12
	MOVQ	96(SP), R15
	JMP	ret