// Source: github.com/filosottile/go@v0.0.0-20170906193555-dbed9972d994/src/runtime/asm_amd64p32.s

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "go_tls.h"
#include "funcdata.h"
#include "textflag.h"

// rt0_go bootstraps the runtime. In order it:
//   - moves argc/argv onto a 16-byte aligned scratch area,
//   - records stack bounds and guards in runtime·g0,
//   - probes CPUID and fills the runtime·support_* flags,
//   - installs TLS via settls and verifies a store through it,
//   - links g0 and m0 to each other,
//   - calls check, args, osinit, schedinit,
//   - queues runtime·main through newproc and calls mstart.
// Register use during CPUID probing: SI holds the highest basic CPUID
// leaf (EAX of leaf 0) from the first CPUID until the eax7 check.
TEXT runtime·rt0_go(SB),NOSPLIT,$0
	// copy arguments forward on an even stack
	MOVL	argc+0(FP), AX
	MOVL	argv+4(FP), BX
	MOVL	SP, CX
	SUBL	$128, CX		// plenty of scratch
	ANDL	$~15, CX
	MOVL	CX, SP

	MOVL	AX, 16(SP)		// argc, reloaded before runtime·args
	MOVL	BX, 24(SP)		// argv, reloaded before runtime·args

	// create istack out of the given (operating system) stack.
	MOVL	$runtime·g0(SB), DI
	LEAL	(-64*1024+104)(SP), BX
	MOVL	BX, g_stackguard0(DI)
	MOVL	BX, g_stackguard1(DI)
	MOVL	BX, (g_stack+stack_lo)(DI)
	MOVL	SP, (g_stack+stack_hi)(DI)

	// find out information about the processor we're on
	MOVL	$0, AX
	CPUID
	// Keep the highest supported basic leaf for the eax7 check below.
	// Without this, SI would hold whatever it contained at entry and the
	// leaf-7 probe would be gated on garbage.
	MOVL	AX, SI
	CMPL	AX, $0
	JE	nocpuinfo

	CMPL	BX, $0x756E6547  // "Genu"
	JNE	notintel
	CMPL	DX, $0x49656E69  // "ineI"
	JNE	notintel
	CMPL	CX, $0x6C65746E  // "ntel"
	JNE	notintel
	MOVB	$1, runtime·isIntel(SB)
notintel:

	// Load EAX=1 cpuid flags
	MOVL	$1, AX
	CPUID
	MOVL	AX, runtime·processorVersionInfo(SB)

	TESTL	$(1<<26), DX // SSE2
	SETNE	runtime·support_sse2(SB)

	TESTL	$(1<<9), CX // SSSE3
	SETNE	runtime·support_ssse3(SB)

	TESTL	$(1<<19), CX // SSE4.1
	SETNE	runtime·support_sse41(SB)

	TESTL	$(1<<20), CX // SSE4.2
	SETNE	runtime·support_sse42(SB)

	TESTL	$(1<<23), CX // POPCNT
	SETNE	runtime·support_popcnt(SB)

	TESTL	$(1<<25), CX // AES
	SETNE	runtime·support_aes(SB)

	TESTL	$(1<<27), CX // OSXSAVE
	SETNE	runtime·support_osxsave(SB)

	// If OS support for XMM and YMM is not present
	// support_avx will be set back to false later.
	TESTL	$(1<<28), CX // AVX
	SETNE	runtime·support_avx(SB)

eax7:
	// Load EAX=7/ECX=0 cpuid flags
	CMPL	SI, $7			// SI = highest basic leaf saved above
	JLT	osavx
	MOVL	$7, AX
	MOVL	$0, CX
	CPUID

	TESTL	$(1<<3), BX // BMI1
	SETNE	runtime·support_bmi1(SB)

	// If OS support for XMM and YMM is not present
	// support_avx2 will be set back to false later.
	TESTL	$(1<<5), BX // AVX2
	SETNE	runtime·support_avx2(SB)

	TESTL	$(1<<8), BX // BMI2
	SETNE	runtime·support_bmi2(SB)

	TESTL	$(1<<9), BX // ERMS
	SETNE	runtime·support_erms(SB)

osavx:
	// nacl does not support XGETBV to test
	// for XMM and YMM OS support.
#ifndef GOOS_nacl
	CMPB	runtime·support_osxsave(SB), $1
	JNE	noavx
	MOVL	$0, CX
	// For XGETBV, OSXSAVE bit is required and sufficient
	XGETBV
	ANDL	$6, AX
	CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
	JE	nocpuinfo
#endif
noavx:
	// Reached by fallthrough on nacl, or when the OS does not save
	// XMM+YMM state: AVX and AVX2 must not be used.
	MOVB	$0, runtime·support_avx(SB)
	MOVB	$0, runtime·support_avx2(SB)

nocpuinfo:

needtls:
	LEAL	runtime·m0+m_tls(SB), DI
	CALL	runtime·settls(SB)

	// store through it, to make sure it works
	get_tls(BX)
	MOVQ	$0x123, g(BX)
	MOVQ	runtime·m0+m_tls(SB), AX
	CMPQ	AX, $0x123
	JEQ	ok
	MOVL	AX, 0	// abort
ok:
	// set the per-goroutine and per-mach "registers"
	get_tls(BX)
	LEAL	runtime·g0(SB), CX
	MOVL	CX, g(BX)
	LEAL	runtime·m0(SB), AX

	// save m->g0 = g0
	MOVL	CX, m_g0(AX)
	// save m0 to g0->m
	MOVL	AX, g_m(CX)

	CLD				// convention is D is always left cleared
	CALL	runtime·check(SB)

	MOVL	16(SP), AX		// copy argc
	MOVL	AX, 0(SP)
	MOVL	24(SP), AX		// copy argv
	MOVL	AX, 4(SP)
	CALL	runtime·args(SB)
	CALL	runtime·osinit(SB)
	CALL	runtime·schedinit(SB)

	// create a new goroutine to start program
	MOVL	$runtime·mainPC(SB), AX	// entry
	MOVL	$0, 0(SP)
	MOVL	AX, 4(SP)
	CALL	runtime·newproc(SB)

	// start this M
	CALL	runtime·mstart(SB)

	MOVL	$0xf1, 0xf1  // crash
	RET

// mainPC holds the address of runtime·main; rt0_go passes it to newproc.
DATA	runtime·mainPC+0(SB)/4,$runtime·main(SB)
GLOBL	runtime·mainPC(SB),RODATA,$4

// breakpoint executes INT $3.
TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
	INT	$3
	RET

TEXT runtime·asminit(SB),NOSPLIT,$0-0
	// No per-thread init.
	RET

/*
 *  go-routine
 */

// void gosave(Gobuf*)
// save state in Gobuf; setjmp
// Stores the caller's SP and PC and the current g into *buf, zeroes
// buf.ret, and calls badctxt if buf.ctxt is non-zero.
TEXT runtime·gosave(SB), NOSPLIT, $0-4
	MOVL	buf+0(FP), AX	// gobuf
	LEAL	buf+0(FP), BX	// caller's SP
	MOVL	BX, gobuf_sp(AX)
	MOVL	0(SP), BX	// caller's PC
	MOVL	BX, gobuf_pc(AX)
	MOVQ	$0, gobuf_ret(AX)
	// Assert ctxt is zero. See func save.
	MOVL	gobuf_ctxt(AX), BX
	TESTL	BX, BX
	JZ	ctxtok
	CALL	runtime·badctxt(SB)
ctxtok:
	get_tls(CX)
	MOVL	g(CX), BX
	MOVL	BX, gobuf_g(AX)
	RET

// void gogo(Gobuf*)
// restore state from Gobuf; longjmp
// Loads g, SP, ctxt (into DX) and ret (into AX) from *buf, clears the
// sp/ret/ctxt fields, and jumps to buf.pc.
TEXT runtime·gogo(SB), NOSPLIT, $8-4
	MOVL	buf+0(FP), BX	// gobuf

	// If ctxt is not nil, invoke deletion barrier before overwriting.
	MOVL	gobuf_ctxt(BX), DX
	TESTL	DX, DX
	JZ	nilctxt
	LEAL	gobuf_ctxt(BX), AX
	MOVL	AX, 0(SP)
	MOVL	$0, 4(SP)
	CALL	runtime·writebarrierptr_prewrite(SB)
	MOVL	buf+0(FP), BX	// reload: BX is not preserved across the call

nilctxt:
	MOVL	gobuf_g(BX), DX
	MOVL	0(DX), CX	// make sure g != nil
	get_tls(CX)
	MOVL	DX, g(CX)
	MOVL	gobuf_sp(BX), SP	// restore SP
	MOVL	gobuf_ctxt(BX), DX
	MOVQ	gobuf_ret(BX), AX
	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
	MOVQ	$0, gobuf_ret(BX)
	MOVL	$0, gobuf_ctxt(BX)
	MOVL	gobuf_pc(BX), BX
	JMP	BX

// func mcall(fn func(*g))
// Switch to m->g0's stack, call fn(g).
// Fn must never return. It should gogo(&g->sched)
// to keep running g.
TEXT runtime·mcall(SB), NOSPLIT, $0-4
	MOVL	fn+0(FP), DI		// DI = fn

	// Record the caller's PC/SP and g itself in g->sched.
	get_tls(CX)
	MOVL	g(CX), AX		// AX = g
	MOVL	0(SP), BX		// caller's PC
	MOVL	BX, (g_sched+gobuf_pc)(AX)
	LEAL	fn+0(FP), BX		// caller's SP
	MOVL	BX, (g_sched+gobuf_sp)(AX)
	MOVL	AX, (g_sched+gobuf_g)(AX)

	// switch to m->g0 & its stack, call fn
	MOVL	g(CX), BX
	MOVL	g_m(BX), BX
	MOVL	m_g0(BX), SI		// SI = m->g0
	CMPL	SI, AX			// if g == m->g0 call badmcall
	JNE	ong
	MOVL	$runtime·badmcall(SB), AX
	JMP	AX
ong:
	MOVL	SI, g(CX)		// g = m->g0
	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	PUSHQ	AX			// argument to fn: the g we came from
	MOVL	DI, DX			// DX = closure context
	MOVL	0(DI), DI		// DI = code pointer stored in the closure
	CALL	DI
	// fn returned, which it must not do.
	POPQ	AX
	MOVL	$runtime·badmcall2(SB), AX
	JMP	AX
	RET

// systemstack_switch is a dummy routine that systemstack leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the system stack because the one at the top of
// the system stack terminates the stack walk (see topofstack()).
TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
	RET

// func systemstack(fn func())
// Calls fn directly when already running on g0 or gsignal; when running
// on m->curg it saves state in g->sched, switches to g0's stack, calls
// fn, and switches back.
TEXT runtime·systemstack(SB), NOSPLIT, $0-4
	MOVL	fn+0(FP), DI		// DI = fn
	get_tls(CX)
	MOVL	g(CX), AX		// AX = g
	MOVL	g_m(AX), BX		// BX = m

	MOVL	m_gsignal(BX), DX	// DX = gsignal
	CMPL	AX, DX
	JEQ	noswitch

	MOVL	m_g0(BX), DX		// DX = g0
	CMPL	AX, DX
	JEQ	noswitch

	MOVL	m_curg(BX), R8
	CMPL	AX, R8
	JEQ	switch

	// Not g0, not curg. Must be gsignal, but that's not allowed.
	// Hide call from linker nosplit analysis.
	MOVL	$runtime·badsystemstack(SB), AX
	CALL	AX

switch:
	// save our state in g->sched. Pretend to
	// be systemstack_switch if the G stack is scanned.
	MOVL	$runtime·systemstack_switch(SB), SI
	MOVL	SI, (g_sched+gobuf_pc)(AX)
	MOVL	SP, (g_sched+gobuf_sp)(AX)
	MOVL	AX, (g_sched+gobuf_g)(AX)

	// switch to g0
	MOVL	DX, g(CX)
	MOVL	(g_sched+gobuf_sp)(DX), SP

	// call target function
	MOVL	DI, DX			// DX = closure context
	MOVL	0(DI), DI		// DI = code pointer
	CALL	DI

	// switch back to g
	get_tls(CX)
	MOVL	g(CX), AX
	MOVL	g_m(AX), BX
	MOVL	m_curg(BX), AX
	MOVL	AX, g(CX)
	MOVL	(g_sched+gobuf_sp)(AX), SP
	MOVL	$0, (g_sched+gobuf_sp)(AX)
	RET

noswitch:
	// already on m stack, just call directly
	MOVL	DI, DX
	MOVL	0(DI), DI
	CALL	DI
	RET

/*
 * support for morestack
 */

// Called during function prolog when more stack is needed.
//
// The traceback routines see morestack on a g0 as being
// the top of a stack (for example, morestack calling newstack
// calling the scheduler calling newm calling gc), so we must
// record an argument size. For that purpose, it has no arguments.
TEXT runtime·morestack(SB),NOSPLIT,$0-0
	get_tls(CX)
	MOVL	g(CX), BX
	MOVL	g_m(BX), BX		// BX = m

	// Cannot grow scheduler stack (m->g0).
	MOVL	m_g0(BX), SI
	CMPL	g(CX), SI
	JNE	notg0
	CALL	runtime·badmorestackg0(SB)
	MOVL	0, AX			// load from address 0
notg0:

	// Cannot grow signal stack (m->gsignal).
	MOVL	m_gsignal(BX), SI
	CMPL	g(CX), SI
	JNE	notgsignal
	CALL	runtime·badmorestackgsignal(SB)
	MOVL	0, AX			// load from address 0
notgsignal:

	// Called from f.
	// Set m->morebuf to f's caller.
	MOVL	8(SP), AX		// f's caller's PC
	MOVL	AX, (m_morebuf+gobuf_pc)(BX)
	LEAL	16(SP), AX		// f's caller's SP
	MOVL	AX, (m_morebuf+gobuf_sp)(BX)
	get_tls(CX)
	MOVL	g(CX), SI
	MOVL	SI, (m_morebuf+gobuf_g)(BX)

	// Set g->sched to context in f.
	MOVL	0(SP), AX		// f's PC
	MOVL	AX, (g_sched+gobuf_pc)(SI)
	MOVL	SI, (g_sched+gobuf_g)(SI)
	LEAL	8(SP), AX		// f's SP
	MOVL	AX, (g_sched+gobuf_sp)(SI)
	// newstack will fill gobuf.ctxt.

	// Call newstack on m->g0's stack.
	MOVL	m_g0(BX), BX
	MOVL	BX, g(CX)
	MOVL	(g_sched+gobuf_sp)(BX), SP
	PUSHQ	DX			// ctxt argument
	CALL	runtime·newstack(SB)
	MOVL	$0, 0x1003		// crash if newstack returns
	POPQ	DX			// keep balance check happy
	RET

// morestack trampolines
// morestack_noctxt zeroes DX (the ctxt that morestack pushes for newstack)
// and tail-jumps to morestack.
TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
	MOVL	$0, DX
	JMP	runtime·morestack(SB)

// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
// of constant-sized-frame functions to encode a few bits of size in the pc.
// Caution: ugly multiline assembly macros in your future!

// DISPATCH jumps to NAME when CX (argsize) <= MAXSIZE, otherwise falls
// through. The skip must stay PC-relative: the macro is expanded many
// times inside one TEXT block, so a named label would be redefined.
#define DISPATCH(NAME,MAXSIZE)	\
	CMPL	CX, $MAXSIZE;	\
	JA	3(PC);		\
	MOVL	$NAME(SB), AX;	\
	JMP	AX
// Note: can't just "JMP NAME(SB)" - bad inlining results.

TEXT reflect·call(SB), NOSPLIT, $0-0
	JMP	·reflectcall(SB)

// reflectcall picks the smallest callN whose frame holds argsize bytes;
// sizes above 1073741824 go to badreflectcall.
TEXT ·reflectcall(SB), NOSPLIT, $0-20
	MOVLQZX	argsize+12(FP), CX
	DISPATCH(runtime·call16, 16)
	DISPATCH(runtime·call32, 32)
	DISPATCH(runtime·call64, 64)
	DISPATCH(runtime·call128, 128)
	DISPATCH(runtime·call256, 256)
	DISPATCH(runtime·call512, 512)
	DISPATCH(runtime·call1024, 1024)
	DISPATCH(runtime·call2048, 2048)
	DISPATCH(runtime·call4096, 4096)
	DISPATCH(runtime·call8192, 8192)
	DISPATCH(runtime·call16384, 16384)
	DISPATCH(runtime·call32768, 32768)
	DISPATCH(runtime·call65536, 65536)
	DISPATCH(runtime·call131072, 131072)
	DISPATCH(runtime·call262144, 262144)
	DISPATCH(runtime·call524288, 524288)
	DISPATCH(runtime·call1048576, 1048576)
	DISPATCH(runtime·call2097152, 2097152)
	DISPATCH(runtime·call4194304, 4194304)
	DISPATCH(runtime·call8388608, 8388608)
	DISPATCH(runtime·call16777216, 16777216)
	DISPATCH(runtime·call33554432, 33554432)
	DISPATCH(runtime·call67108864, 67108864)
	DISPATCH(runtime·call134217728, 134217728)
	DISPATCH(runtime·call268435456, 268435456)
	DISPATCH(runtime·call536870912, 536870912)
	DISPATCH(runtime·call1073741824, 1073741824)
	MOVL	$runtime·badreflectcall(SB), AX
	JMP	AX

// CALLFN defines one fixed-frame call function: it copies argsize bytes
// from argptr to the frame, calls the function held in closure f (DX),
// then hands the bytes from retoffset onward to callRet for copy-back.
#define CALLFN(NAME,MAXSIZE)			\
TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
	NO_LOCAL_POINTERS;			\
	/* copy arguments to stack */		\
	MOVL	argptr+8(FP), SI;		\
	MOVL	argsize+12(FP), CX;		\
	MOVL	SP, DI;				\
	REP;MOVSB;				\
	/* call function */			\
	MOVL	f+4(FP), DX;			\
	MOVL	(DX), AX;			\
	CALL	AX;				\
	/* copy return values back */		\
	MOVL	argtype+0(FP), DX;		\
	MOVL	argptr+8(FP), DI;		\
	MOVL	argsize+12(FP), CX;		\
	MOVL	retoffset+16(FP), BX;		\
	MOVL	SP, SI;				\
	ADDL	BX, DI;				\
	ADDL	BX, SI;				\
	SUBL	BX, CX;				\
	CALL	callRet<>(SB);			\
	RET

// callRet copies return values back at the end of call*. This is a
// separate function so it can allocate stack space for the arguments
// to reflectcallmove. It does not follow the Go ABI; it expects its
// arguments in registers.
// Register inputs, as stored below: DX -> 0(SP), DI -> 4(SP),
// SI -> 8(SP), CX -> 12(SP); then reflectcallmove is called.
TEXT callRet<>(SB), NOSPLIT, $16-0
	MOVL	DX, 0(SP)
	MOVL	DI, 4(SP)
	MOVL	SI, 8(SP)
	MOVL	CX, 12(SP)
	CALL	runtime·reflectcallmove(SB)
	RET

CALLFN(·call16, 16)
CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
CALLFN(·call256, 256)
CALLFN(·call512, 512)
CALLFN(·call1024, 1024)
CALLFN(·call2048, 2048)
CALLFN(·call4096, 4096)
CALLFN(·call8192, 8192)
CALLFN(·call16384, 16384)
CALLFN(·call32768, 32768)
CALLFN(·call65536, 65536)
CALLFN(·call131072, 131072)
CALLFN(·call262144, 262144)
CALLFN(·call524288, 524288)
CALLFN(·call1048576, 1048576)
CALLFN(·call2097152, 2097152)
CALLFN(·call4194304, 4194304)
CALLFN(·call8388608, 8388608)
CALLFN(·call16777216, 16777216)
CALLFN(·call33554432, 33554432)
CALLFN(·call67108864, 67108864)
CALLFN(·call134217728, 134217728)
CALLFN(·call268435456, 268435456)
CALLFN(·call536870912, 536870912)
CALLFN(·call1073741824, 1073741824)

// procyield executes PAUSE in a loop, decrementing the cycles argument
// until it reaches zero.
TEXT runtime·procyield(SB),NOSPLIT,$0-0
	MOVL	cycles+0(FP), AX
spin:
	PAUSE
	SUBL	$1, AX
	JNZ	spin
	RET

TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
	// Stores are already ordered on x86, so this is just a
	// compile barrier.
	RET

// void jmpdefer(fn, sp);
// called from deferreturn.
// 1. pop the caller
// 2. sub 5 bytes from the callers return
// 3. jmp to the argument
TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
	MOVL	fv+0(FP), DX		// DX = closure of the deferred function
	MOVL	argp+4(FP), BX
	LEAL	-8(BX), SP		// caller sp after CALL
	SUBL	$5, (SP)		// return to CALL again
	MOVL	0(DX), BX		// BX = code pointer stored in the closure
	JMP	BX			// but first run the deferred function

// func asmcgocall(fn, arg unsafe.Pointer) int32
// Not implemented.
TEXT runtime·asmcgocall(SB),NOSPLIT,$0-12
	MOVL	0, AX			// load from address 0
	RET

// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
// Not implemented.
TEXT runtime·cgocallback(SB),NOSPLIT,$0-16
	MOVL	0, AX			// load from address 0
	RET

// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
// Not implemented.
TEXT ·cgocallback_gofunc(SB),NOSPLIT,$0-16
	MOVL	0, AX			// load from address 0
	RET

// void setg(G*); set g. for use by needm.
// Not implemented.
TEXT runtime·setg(SB), NOSPLIT, $0-4
	MOVL	0, AX			// load from address 0
	RET

// check that SP is in range [g->stack.lo, g->stack.hi)
// Each failed check performs a load from address 0.
TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
	get_tls(CX)
	MOVL	g(CX), AX
	CMPL	(g_stack+stack_hi)(AX), SP
	JHI	belowhi
	MOVL	0, AX
belowhi:
	CMPL	SP, (g_stack+stack_lo)(AX)
	JHI	abovelo
	MOVL	0, AX
abovelo:
	RET

// memclrNoHeapPointers zeroes n bytes starting at ptr: first n/4
// 4-byte stores, then the remaining n%4 bytes one at a time.
TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-8
	MOVL	ptr+0(FP), DI
	MOVL	n+4(FP), CX
	MOVQ	CX, BX
	ANDQ	$3, BX			// BX = tail byte count (n & 3)
	SHRQ	$2, CX			// CX = number of 4-byte words
	MOVQ	$0, AX
	CLD
	REP; STOSL
	MOVQ	BX, CX
	REP; STOSB
	// Note: we zero only 4 bytes at a time so that the tail is at most
	// 3 bytes. That guarantees that we aren't zeroing pointers with STOSB.
	// See issue 13160.
	RET

// getcallerpc loads the word 8 bytes below the address passed in argp
// and returns it.
TEXT runtime·getcallerpc(SB),NOSPLIT,$8-12
	MOVL	argp+0(FP),AX		// addr of first arg
	MOVL	-8(AX),AX		// get calling pc
	MOVL	AX, ret+8(FP)
	RET

// int64 runtime·cputicks(void)
// Combines the two halves produced by RDTSC into one 64-bit result.
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
	RDTSC
	SHLQ	$32, DX
	ADDQ	DX, AX
	MOVQ	AX, ret+0(FP)
	RET

// hash function using AES hardware instructions
// For now, our one amd64p32 system (NaCl) does not
// support using AES instructions, so have not bothered to
// write the implementations. Can copy and adjust the ones
// in asm_amd64.s when the time comes.

// The four aeshash* entry points below are stubs: each stores whatever
// AX currently holds into its result slot and returns.
TEXT runtime·aeshash(SB),NOSPLIT,$0-20
	MOVL	AX, ret+16(FP)
	RET

TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
	MOVL	AX, ret+8(FP)
	RET

TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
	MOVL	AX, ret+8(FP)
	RET

TEXT runtime·aeshash64(SB),NOSPLIT,$0-12
	MOVL	AX, ret+8(FP)
	RET

// memequal(p, q unsafe.Pointer, size uintptr) bool
// Returns true immediately when both pointers are identical; otherwise
// defers to memeqbody.
TEXT runtime·memequal(SB),NOSPLIT,$0-17
	MOVL	a+0(FP), SI
	MOVL	b+4(FP), DI
	CMPL	SI, DI
	JEQ	eq
	MOVL	size+8(FP), BX
	CALL	runtime·memeqbody(SB)
	MOVB	AX, ret+16(FP)
	RET
eq:
	MOVB	$1, ret+16(FP)
	RET

// memequal_varlen(a, b unsafe.Pointer) bool
// Same as memequal, with the size read from the closure in DX.
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
	MOVL	a+0(FP), SI
	MOVL	b+4(FP), DI
	CMPL	SI, DI
	JEQ	eq
	MOVL	4(DX), BX	// compiler stores size at offset 4 in the closure
	CALL	runtime·memeqbody(SB)
	MOVB	AX, ret+8(FP)
	RET
eq:
	MOVB	$1, ret+8(FP)
	RET

// memeqbody compares two byte ranges for equality.
// input:
//   SI = a
//   DI = b
//   BX = count
// output:
//   AX = 1 if the ranges are equal, 0 otherwise
// Not a Go ABI function: arguments and result are in registers.
// Overwrites SI, DI, BX, CX, DX and X0-X7.
TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
	XORQ	AX, AX		// default result: not equal

	CMPQ	BX, $8
	JB	small

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff	// all 16 mask bits set <=> all 64 bytes equal
	JEQ	hugeloop
	RET

	// 8 bytes at a time using 64-bit register
bigloop:
	CMPQ	BX, $8
	JBE	leftover
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	bigloop
	RET

	// remaining 0-8 bytes
	// Compare the 8 bytes that end at the end of each range.
leftover:
	ADDQ	BX, SI
	ADDQ	BX, DI
	MOVQ	-8(SI), CX
	MOVQ	-8(DI), DX
	CMPQ	CX, DX
	SETEQ	AX
	RET

small:
	CMPQ	BX, $0
	JEQ	equal		// ZF is still set when SETEQ runs at equal

	LEAQ	0(BX*8), CX
	NEGQ	CX		// CX = -(bits to compare); shifts use it mod 64

	CMPB	SI, $0xf8
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	MOVQ	BX, DX
	ADDQ	SI, DX
	MOVQ	-8(DX), SI
	SHRQ	CX, SI
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	BX, DX
	ADDQ	DI, DX
	MOVQ	-8(DX), DI
	SHRQ	CX, DI
di_finish:

	SUBQ	SI, DI
	SHLQ	CX, DI		// shift out bytes beyond count; ZF set iff the rest match
equal:
	SETEQ	AX
	RET

// cmpstring loads both strings' base/len into registers, calls cmpbody
// and stores its AX result.
TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
	MOVL	s1_base+0(FP), SI
	MOVL	s1_len+4(FP), BX
	MOVL	s2_base+8(FP), DI
	MOVL	s2_len+12(FP), DX
	CALL	runtime·cmpbody(SB)
	MOVL	AX, ret+16(FP)
	RET

// bytes·Compare is the same wrapper for two byte slices.
TEXT bytes·Compare(SB),NOSPLIT,$0-28
	MOVL	s1+0(FP), SI
	MOVL	s1+4(FP), BX
	MOVL	s2+12(FP), DI
	MOVL	s2+16(FP), DX
	CALL	runtime·cmpbody(SB)
	MOVL	AX, res+24(FP)
	RET

// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
// output:
//   AX = 1/0/-1
// Not a Go ABI function: arguments and result are in registers.
TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	ADDQ	BX, SI
	MOVB	(SI), CX
	ADDQ	BX, DI
	CMPB	CX, (DI)
	SETHI	AX
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	ADDQ	R8, SI
	ADDQ	R8, DI
	MOVQ	-8(SI), AX
	MOVQ	-8(DI), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
	JEQ	allsame

	// load bytes of a into high bytes of SI
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	ADDQ	R8, SI
	MOVQ	-8(SI), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI

	// load bytes of b into high bytes of DI
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	ADDQ	R8, DI
	MOVQ	-8(DI), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
	RET

	// All compared bytes match: the result depends only on the lengths.
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	RET

TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
	MOVL	s+0(FP), SI
	MOVL	s_len+4(FP), BX
	MOVB	c+12(FP), AL
	CALL	runtime·indexbytebody(SB)
	MOVL	AX, ret+16(FP)
	RET

TEXT strings·IndexByte(SB),NOSPLIT,$0-20
	MOVL	s+0(FP), SI
	MOVL	s_len+4(FP), BX
	MOVB	c+8(FP), AL
	CALL	runtime·indexbytebody(SB)
	MOVL	AX, ret+16(FP)
	RET

// input:
//   SI: data
//   BX: data len
//   AL: byte sought
// output:
//   AX: index of the first matching byte, or -1 if there is none
// Not a Go ABI function: arguments and result are in registers.
// Strategy for len >= 16: byte-scan up to the first 16-byte boundary,
// scan aligned 16-byte chunks with SSE, then byte-scan the tail.
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
	MOVL	SI, DI		// DI = scan cursor; SI stays at the start

	CMPL	BX, $16
	JLT	small

	// round up to first 16-byte boundary
	TESTL	$15, SI
	JZ	aligned
	MOVL	SI, CX
	ANDL	$~15, CX
	ADDL	$16, CX

	// search the beginning
	SUBL	SI, CX
	REPN; SCASB
	JZ	success

// DI is 16-byte aligned; get ready to search using SSE instructions
aligned:
	// round down to last 16-byte boundary
	MOVL	BX, R11
	ADDL	SI, R11
	ANDL	$~15, R11	// R11 = end of the last whole aligned chunk

	// shuffle X0 around so that each byte contains c
	MOVD	AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL	$0, X0, X0
	JMP	condition

sse:
	// move the next 16-byte chunk of the buffer into X1
	MOVO	(DI), X1
	// compare bytes in X0 to X1
	PCMPEQB	X0, X1
	// take the top bit of each byte in X1 and put the result in DX
	PMOVMSKB X1, DX
	TESTL	DX, DX
	JNZ	ssesuccess
	ADDL	$16, DI

condition:
	// DI and R11 are addresses, so the comparison must be unsigned.
	// A signed branch would skip this loop for a buffer that
	// straddles 0x80000000.
	CMPL	DI, R11
	JB	sse

	// search the end
	MOVL	SI, CX
	ADDL	BX, CX
	SUBL	R11, CX
	// if CX == 0, the zero flag will be set and we'll end up
	// returning a false success
	JZ	failure
	REPN; SCASB
	JZ	success

failure:
	MOVL	$-1, AX
	RET

// handle for lengths < 16
small:
	MOVL	BX, CX
	REPN; SCASB
	JZ	success
	MOVL	$-1, AX
	RET

// we've found the chunk containing the byte
// now just figure out which specific byte it is
ssesuccess:
	// get the index of the least significant set bit
	BSFW	DX, DX
	SUBL	SI, DI		// DI = offset of this chunk from the start
	ADDL	DI, DX
	MOVL	DX, AX
	RET

success:
	// SCASB left DI one past the matching byte.
	SUBL	SI, DI
	SUBL	$1, DI
	MOVL	DI, AX
	RET

// bytes·Equal reports false when the lengths differ, otherwise returns
// memeqbody's result for the two slices.
TEXT bytes·Equal(SB),NOSPLIT,$0-25
	MOVL	a_len+4(FP), BX
	MOVL	b_len+16(FP), CX
	XORL	AX, AX
	CMPL	BX, CX
	JNE	eqret
	MOVL	a+0(FP), SI
	MOVL	b+12(FP), DI
	CALL	runtime·memeqbody(SB)
eqret:
	MOVB	AX, ret+24(FP)
	RET

// return0 sets AX to 0 and returns.
TEXT runtime·return0(SB), NOSPLIT, $0
	MOVL	$0, AX
	RET

// The top-most function running on a goroutine
// returns to goexit+PCQuantum.
TEXT runtime·goexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP
	CALL	runtime·goexit1(SB)	// does not return
	// traceback from goexit1 must hit code range of goexit
	BYTE	$0x90	// NOP

// checkASM always returns true.
TEXT ·checkASM(SB),NOSPLIT,$0-1
	MOVB	$1, ret+0(FP)
	RET