// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "zasm_GOOS_GOARCH.h"
#include "funcdata.h"
#include "../../cmd/ld/textflag.h"

// _rt0_go is the process bootstrap.
// It parks argc/argv on a realigned stack, gives g0 default stack
// bounds, records the CPUID feature words, gets TLS working (through
// _cgo_init when that pointer is non-nil, otherwise through ldt0setup),
// installs m0 and g0, runs the runtime initializers, queues
// runtime·main as a new goroutine and finally enters mstart.
TEXT _rt0_go(SB),NOSPLIT,$0
	// Pick up argc/argv, then carve a 16-byte-aligned scratch area
	// and stash both values near its top.
	MOVL	argc+0(FP), AX
	MOVL	argv+4(FP), BX
	SUBL	$128, SP		// plenty of scratch
	ANDL	$~15, SP		// round SP down to a multiple of 16
	MOVL	AX, 120(SP)		// argc kept here until runtime·args
	MOVL	BX, 124(SP)		// argv kept here until runtime·args

	// Default stack bounds for g0.
	// _cgo_init may replace stackguard.
	MOVL	$runtime·g0(SB), BP
	LEAL	(-64*1024+104)(SP), BX
	MOVL	BX, g_stackguard(BP)
	MOVL	BX, g_stackguard0(BP)
	MOVL	SP, g_stackbase(BP)

	// Ask the processor about itself. If CPUID leaf 0 comes back
	// with AX == 0 there is nothing more to query; otherwise record
	// the CX and DX words returned by leaf 1.
	MOVL	$0, AX
	CPUID
	CMPL	AX, $0
	JE	cpuid_done
	MOVL	$1, AX
	CPUID
	MOVL	CX, runtime·cpuid_ecx(SB)
	MOVL	DX, runtime·cpuid_edx(SB)
cpuid_done:

	// When _cgo_init is present, call it so that it can initialize
	// and set up GS. Otherwise GS is set up below.
	MOVL	_cgo_init(SB), AX
	TESTL	AX, AX
	JZ	tls_setup
	MOVL	$setmg_gcc<>(SB), BX
	MOVL	BX, 4(SP)		// second argument: setmg_gcc
	MOVL	BP, 0(SP)		// first argument: g0
	CALL	AX
	// _cgo_init may have changed stackguard0; mirror it into stackguard.
	MOVL	$runtime·g0(SB), CX
	MOVL	g_stackguard0(CX), AX
	MOVL	AX, g_stackguard(CX)
	// After _cgo_init, non-windows systems skip runtime·ldt0setup(SB)
	// and the tls test.
	CMPL	runtime·iswindows(SB), $0
	JEQ	tls_ready
tls_setup:
	// Plan 9 always skips runtime·ldt0setup(SB) and the tls test.
	CMPL	runtime·isplan9(SB), $1
	JEQ	tls_ready

	// set up %gs
	CALL	runtime·ldt0setup(SB)

	// Verify TLS: store a marker through it and read the marker
	// back from tls0.
	get_tls(BX)
	MOVL	$0x123, g(BX)
	MOVL	runtime·tls0(SB), AX
	CMPL	AX, $0x123
	JEQ	tls_ready
	MOVL	AX, 0			// abort: store to address 0
tls_ready:
	// Install the m and g "registers".
	get_tls(BX)
	LEAL	runtime·g0(SB), CX
	MOVL	CX, g(BX)
	LEAL	runtime·m0(SB), AX
	MOVL	AX, m(BX)

	// m->g0 = g0
	MOVL	CX, m_g0(AX)

	CALL	runtime·emptyfunc(SB)	// fault if stack check is wrong

	// convention is D is always cleared
	CLD

	CALL	runtime·check(SB)

	// Hand the saved argc, argv to runtime·args.
	MOVL	120(SP), AX
	MOVL	AX, 0(SP)
	MOVL	124(SP), AX
	MOVL	AX, 4(SP)
	CALL	runtime·args(SB)
	CALL	runtime·osinit(SB)
	CALL	runtime·hashinit(SB)
	CALL	runtime·schedinit(SB)

	// Create the goroutine that will start the program.
	PUSHL	$runtime·main·f(SB)	// entry
	PUSHL	$0			// arg size
	ARGSIZE(8)
	CALL	runtime·newproc(SB)
	ARGSIZE(-1)
	POPL	AX
	POPL	AX

	// start this M
	CALL	runtime·mstart(SB)

	INT	$3
	RET

// runtime·main·f is a one-word read-only cell holding the address of runtime·main.
DATA	runtime·main·f+0(SB)/4,$runtime·main(SB)
GLOBL	runtime·main·f(SB),RODATA,$4

// breakpoint executes INT $3.
TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
	INT	$3
	RET

// asminit loads the x87 control word with 0x27F.
TEXT runtime·asminit(SB),NOSPLIT,$0-0
	// Linux and MinGW start the FPU in extended double precision.
	// Other operating systems use double precision.
	// Change to double precision to match them,
	// and to match other hardware that only has double.
	PUSHL	$0x27F
	FLDCW	0(SP)
	POPL	AX
	RET

/*
 *  go-routine
 */

// void gosave(Gobuf*)
// save state in Gobuf; setjmp
TEXT runtime·gosave(SB), NOSPLIT, $0-4
	MOVL	4(SP), AX		// gobuf
	LEAL	4(SP), BX		// caller's SP
	MOVL	BX, gobuf_sp(AX)
	MOVL	0(SP), BX		// caller's PC
	MOVL	BX, gobuf_pc(AX)
	MOVL	$0, gobuf_ret(AX)
	MOVL	$0, gobuf_ctxt(AX)
	get_tls(CX)
	MOVL	g(CX), BX
	MOVL	BX, gobuf_g(AX)		// gobuf->g = current g
	RET

// void gogo(Gobuf*)
// restore state from Gobuf; longjmp
TEXT runtime·gogo(SB), NOSPLIT, $0-4
	MOVL	4(SP), BX		// gobuf
	MOVL	gobuf_g(BX), DX
	MOVL	0(DX), CX		// make sure g != nil
	get_tls(CX)
	MOVL	DX, g(CX)
	MOVL	gobuf_sp(BX), SP	// restore SP
	MOVL	gobuf_ret(BX), AX
	MOVL	gobuf_ctxt(BX), DX
	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
	MOVL	$0, gobuf_ret(BX)
	MOVL	$0, gobuf_ctxt(BX)
	MOVL	gobuf_pc(BX), BX
	JMP	BX

// void mcall(void (*fn)(G*))
// Switch to m->g0's stack, call fn(g).
// Fn must never return.  It should gogo(&g->sched)
// to keep running g.
TEXT runtime·mcall(SB), NOSPLIT, $0-4
	MOVL	fn+0(FP), DI

	// Record the caller's PC, SP and g in g->sched.
	get_tls(CX)
	MOVL	g(CX), AX
	MOVL	0(SP), BX		// caller's PC
	MOVL	BX, (g_sched+gobuf_pc)(AX)
	LEAL	4(SP), BX		// caller's SP
	MOVL	BX, (g_sched+gobuf_sp)(AX)
	MOVL	AX, (g_sched+gobuf_g)(AX)

	// Switch to m->g0 and its stack, then call fn.
	MOVL	m(CX), BX
	MOVL	m_g0(BX), SI
	CMPL	SI, AX			// already on m->g0? then badmcall
	JNE	mcall_switch
	MOVL	$runtime·badmcall(SB), AX
	JMP	AX
mcall_switch:
	MOVL	SI, g(CX)		// g = m->g0
	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	PUSHL	AX			// fn's argument: the g being left
	CALL	DI
	POPL	AX
	MOVL	$runtime·badmcall2(SB), AX	// fn returned: badmcall2
	JMP	AX
	RET

/*
 * support for morestack
 */

// Called during function prolog when more stack is needed.
//
// The traceback routines see morestack on a g0 as being
// the top of a stack (for example, morestack calling newstack
// calling the scheduler calling newm calling gc), so we must
// record an argument size. For that purpose, it has no arguments.
//
// Register inputs read by the code below: DI = frame size, AX = arg size,
// DX = value stored into g->sched.ctxt.
TEXT runtime·morestack(SB),NOSPLIT,$0-0
	// The scheduler stack (m->g0) cannot be grown: trap if g == m->g0.
	get_tls(CX)
	MOVL	m(CX), BX
	MOVL	m_g0(BX), SI
	CMPL	g(CX), SI
	JNE	morestack_notg0
	INT	$3
morestack_notg0:

	// Record the requested frame size (DI) and arg size (AX) in m.
	MOVL	DI, m_moreframesize(BX)
	MOVL	AX, m_moreargsize(BX)

	// We were called from f.
	// m->morebuf describes f's caller.
	MOVL	4(SP), DI	// f's caller's PC
	MOVL	DI, (m_morebuf+gobuf_pc)(BX)
	LEAL	8(SP), CX	// f's caller's SP
	MOVL	CX, (m_morebuf+gobuf_sp)(BX)
	MOVL	CX, m_moreargp(BX)
	get_tls(CX)
	MOVL	g(CX), SI
	MOVL	SI, (m_morebuf+gobuf_g)(BX)

	// g->sched describes the context inside f.
	MOVL	0(SP), AX	// f's PC
	MOVL	AX, (g_sched+gobuf_pc)(SI)
	MOVL	SI, (g_sched+gobuf_g)(SI)
	LEAL	4(SP), AX	// f's SP
	MOVL	AX, (g_sched+gobuf_sp)(SI)
	MOVL	DX, (g_sched+gobuf_ctxt)(SI)

	// Run newstack on m->g0's stack.
	MOVL	m_g0(BX), BP
	MOVL	BP, g(CX)
	MOVL	(g_sched+gobuf_sp)(BP), AX
	MOVL	-4(AX), BX	// fault if CALL would, before smashing SP
	MOVL	AX, SP
	CALL	runtime·newstack(SB)
	MOVL	$0, 0x1003	// crash if newstack returns
	RET

// Called from panic.  Mimics morestack,
// reuses stack growth code to create a frame
// with the desired args running the desired function.
//
// func call(fn *byte, arg *byte, argsize uint32).
TEXT runtime·newstackcall(SB), NOSPLIT, $0-12
	get_tls(CX)
	MOVL	m(CX), BX

	// m->morebuf gets our caller's PC, SP and g: the state to
	// restore when returning from f.
	MOVL	0(SP), AX	// our caller's PC
	MOVL	AX, (m_morebuf+gobuf_pc)(BX)
	LEAL	4(SP), AX	// our caller's SP
	MOVL	AX, (m_morebuf+gobuf_sp)(BX)
	MOVL	g(CX), AX
	MOVL	AX, (m_morebuf+gobuf_g)(BX)

	// g->sched gets our own PC and SP: the state to restore
	// if this goroutine needs to be restarted.
	MOVL	$runtime·newstackcall(SB), (g_sched+gobuf_pc)(AX)
	MOVL	SP, (g_sched+gobuf_sp)(AX)

	// Set up morestack arguments to call f on a new stack.
	// We set f's frame size to 1, as a hint to newstack
	// that this is a call from runtime·newstackcall.
	// If it turns out that f needs a larger frame than
	// the default stack, f's usual stack growth prolog will
	// allocate a new segment (and recopy the arguments).
	MOVL	4(SP), AX	// fn
	MOVL	8(SP), DX	// arg frame
	MOVL	12(SP), CX	// arg size

	MOVL	AX, m_cret(BX)		// f's PC
	MOVL	DX, m_moreargp(BX)	// f's argument pointer
	MOVL	CX, m_moreargsize(BX)	// f's argument size
	MOVL	$1, m_moreframesize(BX)	// f's frame size

	// Run newstack on m->g0's stack.
	MOVL	m_g0(BX), BP
	get_tls(CX)
	MOVL	BP, g(CX)
	MOVL	(g_sched+gobuf_sp)(BP), SP
	CALL	runtime·newstack(SB)
	MOVL	$0, 0x1103	// crash if newstack returns
	RET

// reflect·call: call a function with the given argument list
// func call(f *FuncVal, arg *byte, argsize uint32).
// we don't have variable-sized frames, so we use a small number
// of constant-sized-frame functions to encode a few bits of size in the pc.
// Caution: ugly multiline assembly macros in your future!

// DISPATCH(NAME, MAXSIZE): when CX <= MAXSIZE (unsigned), jump to
// runtime·NAME through AX; otherwise fall through to whatever follows.
#define DISPATCH(NAME,MAXSIZE)		\
	CMPL	CX, $MAXSIZE;		\
	JA	3(PC);			\
	MOVL	$runtime·NAME(SB), AX;	\
	JMP	AX
// Note: can't just "JMP runtime·NAME(SB)" - bad inlining results.
// reflect·call loads argsize into CX and walks the DISPATCH ladder:
// the first callNNN whose NNN is >= argsize (unsigned) is jumped to.
// An argsize above the largest entry ends up in runtime·badreflectcall.
TEXT reflect·call(SB), NOSPLIT, $0-12
	MOVL	argsize+8(FP), CX
	DISPATCH(call16, 16)
	DISPATCH(call32, 32)
	DISPATCH(call64, 64)
	DISPATCH(call128, 128)
	DISPATCH(call256, 256)
	DISPATCH(call512, 512)
	DISPATCH(call1024, 1024)
	DISPATCH(call2048, 2048)
	DISPATCH(call4096, 4096)
	DISPATCH(call8192, 8192)
	DISPATCH(call16384, 16384)
	DISPATCH(call32768, 32768)
	DISPATCH(call65536, 65536)
	DISPATCH(call131072, 131072)
	DISPATCH(call262144, 262144)
	DISPATCH(call524288, 524288)
	DISPATCH(call1048576, 1048576)
	DISPATCH(call2097152, 2097152)
	DISPATCH(call4194304, 4194304)
	DISPATCH(call8388608, 8388608)
	DISPATCH(call16777216, 16777216)
	DISPATCH(call33554432, 33554432)
	DISPATCH(call67108864, 67108864)
	DISPATCH(call134217728, 134217728)
	DISPATCH(call268435456, 268435456)
	DISPATCH(call536870912, 536870912)
	DISPATCH(call1073741824, 1073741824)
	MOVL	$runtime·badreflectcall(SB), AX
	JMP	AX

// CALLFN(NAME, MAXSIZE) defines runtime·NAME with a MAXSIZE-byte frame.
// The function copies argsize bytes from argptr to the bottom of its
// frame, calls through the first word of f, then copies argsize bytes
// from the frame back to argptr.
#define CALLFN(NAME,MAXSIZE)			\
TEXT runtime·NAME(SB), 0, $MAXSIZE-12;		\
	/* copy arguments to stack */		\
	MOVL	argptr+4(FP), SI;		\
	MOVL	argsize+8(FP), CX;		\
	MOVL	SP, DI;				\
	REP;MOVSB;				\
	/* call function */			\
	MOVL	f+0(FP), DX;			\
	CALL	(DX);				\
	/* copy return values back */		\
	MOVL	argptr+4(FP), DI;		\
	MOVL	argsize+8(FP), CX;		\
	MOVL	SP, SI;				\
	REP;MOVSB;				\
	RET

CALLFN(call16, 16)
CALLFN(call32, 32)
CALLFN(call64, 64)
CALLFN(call128, 128)
CALLFN(call256, 256)
CALLFN(call512, 512)
CALLFN(call1024, 1024)
CALLFN(call2048, 2048)
CALLFN(call4096, 4096)
CALLFN(call8192, 8192)
CALLFN(call16384, 16384)
CALLFN(call32768, 32768)
CALLFN(call65536, 65536)
CALLFN(call131072, 131072)
CALLFN(call262144, 262144)
CALLFN(call524288, 524288)
CALLFN(call1048576, 1048576)
CALLFN(call2097152, 2097152)
CALLFN(call4194304, 4194304)
CALLFN(call8388608, 8388608)
CALLFN(call16777216, 16777216)
CALLFN(call33554432, 33554432)
CALLFN(call67108864, 67108864)
CALLFN(call134217728, 134217728)
CALLFN(call268435456, 268435456)
CALLFN(call536870912, 536870912)
CALLFN(call1073741824, 1073741824)

// Return point when leaving stack.
//
// Lessstack can appear in stack traces for the same reason
// as morestack; in that context, it has 0 arguments.
TEXT runtime·lessstack(SB), NOSPLIT, $0-0
	// m->cret = AX (the return value being carried back)
	get_tls(CX)
	MOVL	m(CX), BX
	MOVL	AX, m_cret(BX)

	// Run oldstack on m->g0's stack.
	MOVL	m_g0(BX), BP
	MOVL	BP, g(CX)
	MOVL	(g_sched+gobuf_sp)(BP), SP
	CALL	runtime·oldstack(SB)
	MOVL	$0, 0x1004	// crash if oldstack returns
	RET


// bool cas(int32 *val, int32 old, int32 new)
// Atomically:
//	if(*val == old){
//		*val = new;
//		return 1;
//	}else
//		return 0;
TEXT runtime·cas(SB), NOSPLIT, $0-12
	MOVL	4(SP), BX	// val
	MOVL	8(SP), AX	// old (CMPXCHGL compares against AX)
	MOVL	12(SP), CX	// new
	LOCK
	CMPXCHGL	CX, 0(BX)
	JZ	cas_swapped
	MOVL	$0, AX
	RET
cas_swapped:
	MOVL	$1, AX
	RET

// bool runtime·cas64(uint64 *val, uint64 old, uint64 new)
// Atomically:
//	if(*val == old){
//		*val = new;
//		return 1;
//	} else {
//		return 0;
//	}
TEXT runtime·cas64(SB), NOSPLIT, $0-20
	MOVL	4(SP), BP	// val
	MOVL	8(SP), AX	// old, low word
	MOVL	12(SP), DX	// old, high word
	MOVL	16(SP), BX	// new, low word
	MOVL	20(SP), CX	// new, high word
	LOCK
	CMPXCHG8B	0(BP)
	JZ	cas64_swapped
	MOVL	$0, AX
	RET
cas64_swapped:
	MOVL	$1, AX
	RET

// bool casp(void **p, void *old, void *new)
// Atomically:
//	if(*p == old){
//		*p = new;
//		return 1;
//	}else
//		return 0;
TEXT runtime·casp(SB), NOSPLIT, $0-12
	MOVL	4(SP), BX	// p
	MOVL	8(SP), AX	// old
	MOVL	12(SP), CX	// new
	LOCK
	CMPXCHGL	CX, 0(BX)
	JZ	casp_swapped
	MOVL	$0, AX
	RET
casp_swapped:
	MOVL	$1, AX
	RET

// uint32 xadd(uint32 volatile *val, int32 delta)
// Atomically:
//	*val += delta;
//	return *val;
TEXT runtime·xadd(SB), NOSPLIT, $0-8
	MOVL	4(SP), BX	// val
	MOVL	8(SP), AX	// delta
	MOVL	AX, CX		// keep delta: XADDL overwrites AX with the old *val
	LOCK
	XADDL	AX, 0(BX)
	ADDL	CX, AX		// old value + delta = new value
	RET

// xchg stores the second argument into the word addressed by the first
// and returns, in AX, the value that word held before.
TEXT runtime·xchg(SB), NOSPLIT, $0-8
	MOVL	4(SP), BX
	MOVL	8(SP), AX
	XCHGL	AX, 0(BX)
	RET

// procyield executes PAUSE once per count of its single 4-byte
// argument. The argument size is declared as 4 so that the TEXT
// directive agrees with the load from 4(SP) below.
// NOTE(review): a count of 0 is decremented before it is tested, so it
// wraps around instead of returning at once — callers presumably never
// pass 0; confirm.
TEXT runtime·procyield(SB),NOSPLIT,$0-4
	MOVL	4(SP), AX	// iteration count
again:
	PAUSE
	SUBL	$1, AX
	JNZ	again
	RET

// atomicstorep stores the second argument through the pointer in the
// first, using XCHGL.
TEXT runtime·atomicstorep(SB), NOSPLIT, $0-8
	MOVL	4(SP), BX
	MOVL	8(SP), AX
	XCHGL	AX, 0(BX)
	RET

// atomicstore stores the second argument through the pointer in the
// first, using XCHGL.
TEXT runtime·atomicstore(SB), NOSPLIT, $0-8
	MOVL	4(SP), BX
	MOVL	8(SP), AX
	XCHGL	AX, 0(BX)
	RET

// uint64 atomicload64(uint64 volatile* addr);
// so actually
// void atomicload64(uint64 *res, uint64 volatile *addr);
TEXT runtime·atomicload64(SB), NOSPLIT, $0-8
	MOVL	4(SP), BX	// res
	MOVL	8(SP), AX	// addr
	// MOVQ (%EAX), %MM0
	BYTE $0x0f; BYTE $0x6f; BYTE $0x00
	// MOVQ %MM0, 0(%EBX)
	BYTE $0x0f; BYTE $0x7f; BYTE $0x03
	// EMMS
	BYTE $0x0F; BYTE $0x77
	RET

// void runtime·atomicstore64(uint64 volatile* addr, uint64 v);
TEXT runtime·atomicstore64(SB), NOSPLIT, $0-12
	MOVL	4(SP), AX	// addr
	// MOVQ and EMMS were introduced on the Pentium MMX.
	// MOVQ 0x8(%ESP), %MM0
	BYTE $0x0f; BYTE $0x6f; BYTE $0x44; BYTE $0x24; BYTE $0x08
	// MOVQ %MM0, (%EAX)
	BYTE $0x0f; BYTE $0x7f; BYTE $0x00
	// EMMS
	BYTE $0x0F; BYTE $0x77
	// This is essentially a no-op, but it provides required memory fencing.
	// It can be replaced with MFENCE, but MFENCE was introduced only on the Pentium4 (SSE2).
	MOVL	$0, AX
	LOCK
	XADDL	AX, (SP)
	RET

// void jmpdefer(fn, sp);
// called from deferreturn.
// 1. pop the caller
// 2. sub 5 bytes from the callers return
// 3. jmp to the argument
TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
	MOVL	4(SP), DX	// fn
	MOVL	8(SP), BX	// caller sp
	LEAL	-4(BX), SP	// caller sp after CALL
	SUBL	$5, (SP)	// return to CALL again
	MOVL	0(DX), BX	// code address = first word of fn
	JMP	BX	// but first run the deferred function

// gosave<> records the caller's state in g->sched:
// sched.sp = address of the caller's argument area, sched.pc = the
// word just below it (the return address), sched.ret = sched.ctxt = 0.
// AX and BX are preserved through a push/pop pair.
TEXT gosave<>(SB),NOSPLIT,$0
	PUSHL	AX
	PUSHL	BX
	get_tls(BX)
	MOVL	g(BX), BX
	LEAL	arg+0(FP), AX
	MOVL	AX, (g_sched+gobuf_sp)(BX)
	MOVL	-4(AX), AX
	MOVL	AX, (g_sched+gobuf_pc)(BX)
	MOVL	$0, (g_sched+gobuf_ret)(BX)
	MOVL	$0, (g_sched+gobuf_ctxt)(BX)
	POPL	BX
	POPL	AX
	RET

// asmcgocall(void(*fn)(void*), void *arg)
// Call fn(arg) on the scheduler stack,
// aligned appropriately for the gcc ABI.
// See cgocall.c for more details.
TEXT runtime·asmcgocall(SB),NOSPLIT,$0-8
	MOVL	fn+0(FP), AX
	MOVL	arg+4(FP), BX
	MOVL	SP, DX		// entry SP, restored before returning

	// Decide whether a switch to the m->g0 stack is needed.
	// We get called to create new OS threads too, and those
	// come in on the m->g0 stack already.
	get_tls(CX)
	MOVL	m(CX), BP
	MOVL	m_g0(BP), SI
	MOVL	g(CX), DI
	CMPL	SI, DI
	JEQ	asmcgocall_onsched
	CALL	gosave<>(SB)
	MOVL	SI, g(CX)
	MOVL	(g_sched+gobuf_sp)(SI), SP
asmcgocall_onsched:

	// Now on a scheduling stack (a pthread-created stack).
	SUBL	$32, SP
	ANDL	$~15, SP	// alignment, perhaps unnecessary
	MOVL	DI, 8(SP)	// save g
	MOVL	DX, 4(SP)	// save SP
	MOVL	BX, 0(SP)	// first argument in x86-32 ABI
	CALL	AX

	// Put back g and the stack pointer saved above.
	get_tls(CX)
	MOVL	8(SP), DI
	MOVL	DI, g(CX)
	MOVL	4(SP), SP
	RET

// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
// Turn the fn into a Go func (by taking its address) and call
// cgocallback_gofunc.
TEXT runtime·cgocallback(SB),NOSPLIT,$12-12
	LEAL	fn+0(FP), AX	// &fn serves as the FuncVal*
	MOVL	AX, 0(SP)
	MOVL	frame+4(FP), AX
	MOVL	AX, 4(SP)
	MOVL	framesize+8(FP), AX
	MOVL	AX, 8(SP)
	MOVL	$runtime·cgocallback_gofunc(SB), AX
	CALL	AX
	RET

// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
// See cgocall.c for more details.
TEXT runtime·cgocallback_gofunc(SB),NOSPLIT,$12-12
	// If m is nil, Go did not create the current thread.
	// Call needm to obtain one for temporary use.
	// In this case, we're running on the thread stack, so there's
	// lots of space, but the linker doesn't know. Hide the call from
	// the linker analysis by using an indirect call through AX.
	get_tls(CX)
#ifdef GOOS_windows
	// With a nil TLS base, leave BP = 0 and skip the load of m below.
	MOVL	$0, BP
	CMPL	CX, $0
	JEQ	2(PC)
#endif
	MOVL	m(CX), BP
	MOVL	BP, DX // saved copy of oldm
	CMPL	BP, $0
	JNE	havem
needm:
	MOVL	DX, 0(SP)	// keep oldm across the call
	MOVL	$runtime·needm(SB), AX
	CALL	AX
	MOVL	0(SP), DX
	get_tls(CX)
	MOVL	m(CX), BP

havem:
	// Now there's a valid m, and we're running on its m->g0.
	// Save current m->g0->sched.sp on stack and then set it to SP.
	// Save current sp in m->g0->sched.sp in preparation for
	// switch back to m->curg stack.
	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
	// On Windows, the SEH is at 4(SP) and 8(SP).
	MOVL	m_g0(BP), SI
	MOVL	(g_sched+gobuf_sp)(SI), AX
	MOVL	AX, 0(SP)
	MOVL	SP, (g_sched+gobuf_sp)(SI)

	// Switch to m->curg stack and call runtime.cgocallbackg.
	// Because we are taking over the execution of m->curg
	// but *not* resuming what had been running, we need to
	// save that information (m->curg->sched) so we can restore it.
	// We can restore m->curg->sched.sp easily, because calling
	// runtime.cgocallbackg leaves SP unchanged upon return.
	// To save m->curg->sched.pc, we push it onto the stack.
	// This has the added benefit that it looks to the traceback
	// routine like cgocallbackg is going to return to that
	// PC (because the frame we allocate below has the same
	// size as cgocallback_gofunc's frame declared above)
	// so that the traceback will seamlessly trace back into
	// the earlier calls.
	//
	// In the new goroutine, 0(SP) holds the saved oldm (DX) register.
	// 4(SP) and 8(SP) are unused.
	MOVL	m_curg(BP), SI
	MOVL	SI, g(CX)
	MOVL	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
	MOVL	(g_sched+gobuf_pc)(SI), BP
	MOVL	BP, -4(DI)
	LEAL	-(4+12)(DI), SP
	MOVL	DX, 0(SP)
	CALL	runtime·cgocallbackg(SB)
	MOVL	0(SP), DX

	// Restore g->sched (== m->curg->sched) from saved values.
	get_tls(CX)
	MOVL	g(CX), SI
	MOVL	12(SP), BP
	MOVL	BP, (g_sched+gobuf_pc)(SI)
	LEAL	(12+4)(SP), DI
	MOVL	DI, (g_sched+gobuf_sp)(SI)

	// Switch back to m->g0's stack and restore m->g0->sched.sp.
	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
	// so we do not have to restore it.)
	MOVL	m(CX), BP
	MOVL	m_g0(BP), SI
	MOVL	SI, g(CX)
	MOVL	(g_sched+gobuf_sp)(SI), SP
	MOVL	0(SP), AX
	MOVL	AX, (g_sched+gobuf_sp)(SI)

	// If the m on entry was nil, we called needm above to borrow an m
	// for the duration of the call. Since the call is over, return it with dropm.
	CMPL	DX, $0
	JNE	3(PC)
	MOVL	$runtime·dropm(SB), AX
	CALL	AX

	// Done!
	RET

// void setmg(M*, G*); set m and g. for use by needm.
//
// On Windows the TLS slot at 0x14(FS) is pointed at m->tls first; a nil
// m clears that slot and returns without touching m or g.
TEXT runtime·setmg(SB), NOSPLIT, $0-8
#ifdef GOOS_windows
	MOVL	mm+0(FP), AX
	CMPL	AX, $0
	JNE	settls
	MOVL	$0, 0x14(FS)
	RET
settls:
	LEAL	m_tls(AX), AX
	MOVL	AX, 0x14(FS)
#endif
	get_tls(CX)
	MOVL	mm+0(FP), AX	// loaded once; the Windows path above clobbers AX
	MOVL	AX, m(CX)
	MOVL	gg+4(FP), BX
	MOVL	BX, g(CX)
	RET

// void setmg_gcc(M*, G*); set m and g.
// for use by gcc
TEXT setmg_gcc<>(SB), NOSPLIT, $0
	get_tls(AX)
	MOVL	mm+0(FP), DX
	MOVL	DX, m(AX)
	MOVL	gg+4(FP), DX
	MOVL	DX, g(AX)
	RET

// check that SP is in range (g->stackguard, g->stackbase):
// INT $3 unless g->stackbase > SP, and again unless SP > g->stackguard
// (both comparisons unsigned).
TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
	get_tls(CX)
	MOVL	g(CX), AX
	CMPL	g_stackbase(AX), SP
	JHI	2(PC)
	INT	$3
	CMPL	SP, g_stackguard(AX)
	JHI	2(PC)
	INT	$3
	RET

// memclr(addr, count) zeroes count bytes starting at addr:
// count/4 doublewords with STOSL, then the remaining count%4 bytes
// with STOSB.
TEXT runtime·memclr(SB),NOSPLIT,$0-8
	MOVL	4(SP), DI		// arg 1 addr
	MOVL	8(SP), CX		// arg 2 count
	MOVL	CX, BX
	ANDL	$3, BX			// BX = count % 4
	SHRL	$2, CX			// CX = count / 4
	MOVL	$0, AX
	CLD
	REP
	STOSL
	MOVL	BX, CX
	REP
	STOSB
	RET

// getcallerpc(x): x is the address of the caller's first argument;
// the word just below it is returned in AX.
TEXT runtime·getcallerpc(SB),NOSPLIT,$0-4
	MOVL	x+0(FP),AX		// addr of first arg
	MOVL	-4(AX),AX		// get calling pc
	RET

// setcallerpc(x, newpc): stores newpc into the word just below x.
TEXT runtime·setcallerpc(SB),NOSPLIT,$0-8
	MOVL	x+0(FP),AX		// addr of first arg
	MOVL	newpc+4(FP), BX
	MOVL	BX, -4(AX)		// set calling pc
	RET

// getcallersp returns its argument unchanged in AX.
TEXT runtime·getcallersp(SB), NOSPLIT, $0-4
	MOVL	sp+0(FP), AX
	RET

// int64 runtime·cputicks(void), so really
// void runtime·cputicks(int64 *ticks)
TEXT runtime·cputicks(SB),NOSPLIT,$0-4
	RDTSC
	MOVL	ret+0(FP), DI
	MOVL	AX, 0(DI)		// low word
	MOVL	DX, 4(DI)		// high word
	RET

TEXT runtime·ldt0setup(SB),NOSPLIT,$16-0
	// set up ldt 7 to point at tls0
	// ldt 1 would be fine on Linux, but on OS X, 7 is as low as we can go.
	// the entry number is just a hint.  setldt will set up GS with what it used.
	MOVL	$7, 0(SP)
	LEAL	runtime·tls0(SB), AX
	MOVL	AX, 4(SP)
	MOVL	$32, 8(SP)	// sizeof(tls array)
	CALL	runtime·setldt(SB)
	RET

// emptyfunc does nothing. It is not marked NOSPLIT.
TEXT runtime·emptyfunc(SB),0,$0-0
	RET

// abort raises a breakpoint trap and never returns. Should execution
// be resumed after the trap, it spins here rather than running on
// into the function that follows in the text segment.
TEXT runtime·abort(SB),NOSPLIT,$0-0
	INT	$0x3
abort_spin:
	JMP	abort_spin

// stackguard(sp, limit): stores the current SP into the first argument
// slot and g->stackguard into the second.
TEXT runtime·stackguard(SB),NOSPLIT,$0-8
	MOVL	SP, DX
	MOVL	DX, sp+0(FP)
	get_tls(CX)
	MOVL	g(CX), BX
	MOVL	g_stackguard(BX), DX
	MOVL	DX, limit+4(FP)
	RET

GLOBL runtime·tls0(SB), $32

// hash function using AES hardware instructions
TEXT runtime·aeshash(SB),NOSPLIT,$0-12
	MOVL	4(SP), DX	// ptr to hash value
	MOVL	8(SP), CX	// size
	MOVL	12(SP), AX	// ptr to data
	JMP	runtime·aeshashbody(SB)

// aeshashstr: like aeshash, with data pointer and length taken from
// the string struct passed as the third argument.
TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
	MOVL	4(SP), DX	// ptr to hash value
	MOVL	12(SP), AX	// ptr to string struct
	MOVL	4(AX), CX	// length of string
	MOVL	(AX), AX	// string data
	JMP	runtime·aeshashbody(SB)

// AX: data
// CX: length
// DX: ptr to seed input / hash output
TEXT runtime·aeshashbody(SB),NOSPLIT,$0-12
	MOVL	(DX), X0	// seed to low 32 bits of xmm0
	PINSRD	$1, CX, X0	// size to next 32 bits of xmm0
	MOVO	runtime·aeskeysched+0(SB), X2
	MOVO	runtime·aeskeysched+16(SB), X3
	CMPL	CX, $16
	JB	aessmall
aesloop:
	CMPL	CX, $16
	JBE	aesloopend
	MOVOU	(AX), X1
	AESENC	X2, X0
	AESENC	X1, X0
	SUBL	$16, CX
	ADDL	$16, AX
	JMP	aesloop
// 1-16 bytes remaining
aesloopend:
	// This load may overlap with the previous load above.
	// We'll hash some bytes twice, but that's ok.
	MOVOU	-16(AX)(CX*1), X1
	JMP	partial
// 0-15 bytes
aessmall:
	TESTL	CX, CX
	JE	finalize	// 0 bytes

	CMPB	AX, $0xf0
	JA	highpartial

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	(AX), X1
	ADDL	CX, CX		// CX*2, scaled by 8 below = 16-byte table stride
	PAND	masks<>(SB)(CX*8), X1
	JMP	partial
highpartial:
	// address ends in 1111xxxx.  Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	MOVOU	-16(AX)(CX*1), X1
	ADDL	CX, CX
	PSHUFB	shifts<>(SB)(CX*8), X1
partial:
	// incorporate partial block into hash
	AESENC	X3, X0
	AESENC	X1, X0
finalize:
	// finalize hash
	AESENC	X2, X0
	AESENC	X3, X0
	AESENC	X2, X0
	MOVL	X0, (DX)
	RET

// aeshash32: hashes the 32-bit value addressed by the third argument,
// seeded from and stored back to the word addressed by the first.
TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
	MOVL	4(SP), DX	// ptr to hash value
	MOVL	12(SP), AX	// ptr to data
	MOVL	(DX), X0	// seed
	PINSRD	$1, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+0(SB), X0
	MOVL	X0, (DX)
	RET

// aeshash64: hashes the 64-bit value addressed by the third argument,
// seeded from and stored back to the word addressed by the first.
TEXT runtime·aeshash64(SB),NOSPLIT,$0-12
	MOVL	4(SP), DX	// ptr to hash value
	MOVL	12(SP), AX	// ptr to data
	MOVQ	(AX), X0	// data
	PINSRD	$2, (DX), X0	// seed
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+0(SB), X0
	MOVL	X0, (DX)
	RET

// simple mask to get rid of data in the high part of the register.
// masks<>: sixteen 16-byte entries; entry n has its low n bytes set
// to 0xff and the rest zero.
DATA masks<>+0x00(SB)/4, $0x00000000
DATA masks<>+0x04(SB)/4, $0x00000000
DATA masks<>+0x08(SB)/4, $0x00000000
DATA masks<>+0x0c(SB)/4, $0x00000000

DATA masks<>+0x10(SB)/4, $0x000000ff
DATA masks<>+0x14(SB)/4, $0x00000000
DATA masks<>+0x18(SB)/4, $0x00000000
DATA masks<>+0x1c(SB)/4, $0x00000000

DATA masks<>+0x20(SB)/4, $0x0000ffff
DATA masks<>+0x24(SB)/4, $0x00000000
DATA masks<>+0x28(SB)/4, $0x00000000
DATA masks<>+0x2c(SB)/4, $0x00000000

DATA masks<>+0x30(SB)/4, $0x00ffffff
DATA masks<>+0x34(SB)/4, $0x00000000
DATA masks<>+0x38(SB)/4, $0x00000000
DATA masks<>+0x3c(SB)/4, $0x00000000

DATA masks<>+0x40(SB)/4, $0xffffffff
DATA masks<>+0x44(SB)/4, $0x00000000
DATA masks<>+0x48(SB)/4, $0x00000000
DATA masks<>+0x4c(SB)/4, $0x00000000

DATA masks<>+0x50(SB)/4, $0xffffffff
DATA masks<>+0x54(SB)/4, $0x000000ff
DATA masks<>+0x58(SB)/4, $0x00000000
DATA masks<>+0x5c(SB)/4, $0x00000000

DATA masks<>+0x60(SB)/4, $0xffffffff
DATA masks<>+0x64(SB)/4, $0x0000ffff
DATA masks<>+0x68(SB)/4, $0x00000000
DATA masks<>+0x6c(SB)/4, $0x00000000

DATA masks<>+0x70(SB)/4, $0xffffffff
DATA masks<>+0x74(SB)/4, $0x00ffffff
DATA masks<>+0x78(SB)/4, $0x00000000
DATA masks<>+0x7c(SB)/4, $0x00000000

DATA masks<>+0x80(SB)/4, $0xffffffff
DATA masks<>+0x84(SB)/4, $0xffffffff
DATA masks<>+0x88(SB)/4, $0x00000000
DATA masks<>+0x8c(SB)/4, $0x00000000

DATA masks<>+0x90(SB)/4, $0xffffffff
DATA masks<>+0x94(SB)/4, $0xffffffff
DATA masks<>+0x98(SB)/4, $0x000000ff
DATA masks<>+0x9c(SB)/4, $0x00000000

DATA masks<>+0xa0(SB)/4, $0xffffffff
DATA masks<>+0xa4(SB)/4, $0xffffffff
DATA masks<>+0xa8(SB)/4, $0x0000ffff
DATA masks<>+0xac(SB)/4, $0x00000000

DATA masks<>+0xb0(SB)/4, $0xffffffff
DATA masks<>+0xb4(SB)/4, $0xffffffff
DATA masks<>+0xb8(SB)/4, $0x00ffffff
DATA masks<>+0xbc(SB)/4, $0x00000000

DATA masks<>+0xc0(SB)/4, $0xffffffff
DATA masks<>+0xc4(SB)/4, $0xffffffff
DATA masks<>+0xc8(SB)/4, $0xffffffff
DATA masks<>+0xcc(SB)/4, $0x00000000

DATA masks<>+0xd0(SB)/4, $0xffffffff
DATA masks<>+0xd4(SB)/4, $0xffffffff
DATA masks<>+0xd8(SB)/4, $0xffffffff
DATA masks<>+0xdc(SB)/4, $0x000000ff

DATA masks<>+0xe0(SB)/4, $0xffffffff
DATA masks<>+0xe4(SB)/4, $0xffffffff
DATA masks<>+0xe8(SB)/4, $0xffffffff
DATA masks<>+0xec(SB)/4, $0x0000ffff

DATA masks<>+0xf0(SB)/4, $0xffffffff
DATA masks<>+0xf4(SB)/4, $0xffffffff
DATA masks<>+0xf8(SB)/4, $0xffffffff
DATA masks<>+0xfc(SB)/4, $0x00ffffff

GLOBL masks<>(SB),RODATA,$256

// these are arguments to pshufb.  They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
DATA shifts<>+0x00(SB)/4, $0x00000000
DATA shifts<>+0x04(SB)/4, $0x00000000
DATA shifts<>+0x08(SB)/4, $0x00000000
DATA shifts<>+0x0c(SB)/4, $0x00000000

DATA shifts<>+0x10(SB)/4, $0xffffff0f
DATA shifts<>+0x14(SB)/4, $0xffffffff
DATA shifts<>+0x18(SB)/4, $0xffffffff
DATA shifts<>+0x1c(SB)/4, $0xffffffff

DATA shifts<>+0x20(SB)/4, $0xffff0f0e
DATA shifts<>+0x24(SB)/4, $0xffffffff
DATA shifts<>+0x28(SB)/4, $0xffffffff
DATA shifts<>+0x2c(SB)/4, $0xffffffff

DATA shifts<>+0x30(SB)/4, $0xff0f0e0d
DATA shifts<>+0x34(SB)/4, $0xffffffff
DATA shifts<>+0x38(SB)/4, $0xffffffff
DATA shifts<>+0x3c(SB)/4, $0xffffffff

DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c
DATA shifts<>+0x44(SB)/4, $0xffffffff
DATA shifts<>+0x48(SB)/4, $0xffffffff
DATA shifts<>+0x4c(SB)/4, $0xffffffff

DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b
DATA shifts<>+0x54(SB)/4, $0xffffff0f
DATA shifts<>+0x58(SB)/4, $0xffffffff
DATA shifts<>+0x5c(SB)/4, $0xffffffff

DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a
DATA shifts<>+0x64(SB)/4, $0xffff0f0e
DATA shifts<>+0x68(SB)/4, $0xffffffff
DATA shifts<>+0x6c(SB)/4, $0xffffffff

DATA shifts<>+0x70(SB)/4, $0x0c0b0a09
DATA shifts<>+0x74(SB)/4, $0xff0f0e0d
DATA shifts<>+0x78(SB)/4, $0xffffffff
DATA shifts<>+0x7c(SB)/4, $0xffffffff

DATA shifts<>+0x80(SB)/4, $0x0b0a0908
DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c
DATA shifts<>+0x88(SB)/4, $0xffffffff
DATA shifts<>+0x8c(SB)/4, $0xffffffff

DATA shifts<>+0x90(SB)/4, $0x0a090807
DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b
DATA shifts<>+0x98(SB)/4, $0xffffff0f
DATA shifts<>+0x9c(SB)/4, $0xffffffff

DATA shifts<>+0xa0(SB)/4, $0x09080706
DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a
DATA shifts<>+0xa8(SB)/4, $0xffff0f0e
DATA shifts<>+0xac(SB)/4, $0xffffffff

DATA shifts<>+0xb0(SB)/4, $0x08070605
DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09
DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d
DATA shifts<>+0xbc(SB)/4, $0xffffffff

DATA shifts<>+0xc0(SB)/4, $0x07060504
DATA shifts<>+0xc4(SB)/4, $0x0b0a0908
DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c
DATA shifts<>+0xcc(SB)/4, $0xffffffff

DATA shifts<>+0xd0(SB)/4, $0x06050403
DATA shifts<>+0xd4(SB)/4, $0x0a090807
DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b
DATA shifts<>+0xdc(SB)/4, $0xffffff0f

DATA shifts<>+0xe0(SB)/4, $0x05040302
DATA shifts<>+0xe4(SB)/4, $0x09080706
DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a
DATA shifts<>+0xec(SB)/4, $0xffff0f0e

DATA shifts<>+0xf0(SB)/4, $0x04030201
DATA shifts<>+0xf4(SB)/4, $0x08070605
DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09
DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d

GLOBL shifts<>(SB),RODATA,$256

// memeq(a, b, count): loads the register arguments of memeqbody and
// tail-jumps to it, so memeqbody's AX is the result.
TEXT runtime·memeq(SB),NOSPLIT,$0-12
	MOVL	a+0(FP), SI
	MOVL	b+4(FP), DI
	MOVL	count+8(FP), BX
	JMP	runtime·memeqbody(SB)

// bytes·Equal(a, b []byte) bool: stores 0 when the lengths differ,
// otherwise the low byte of memeqbody's result.
TEXT bytes·Equal(SB),NOSPLIT,$0-25
	MOVL	a_len+4(FP), BX
	MOVL	b_len+16(FP), CX
	XORL	AX, AX			// result if the lengths differ
	CMPL	BX, CX
	JNE	eqret
	MOVL	a+0(FP), SI
	MOVL	b+12(FP), DI
	CALL	runtime·memeqbody(SB)
eqret:
	MOVB	AX, ret+24(FP)
	RET

// a in SI
// b in DI
// count in BX
// result in AX: 1 if the count bytes at a and b are equal, else 0
TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
	XORL	AX, AX			// result on every early-out mismatch

	CMPL	BX, $4
	JB	eq_small

	// 64 bytes at a time using xmm registers
eq_hugeloop:
	CMPL	BX, $64
	JB	eq_bigloop
	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
	JE	eq_bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB	X0, DX
	ADDL	$64, SI
	ADDL	$64, DI
	SUBL	$64, BX
	CMPL	DX, $0xffff		// all 16 mask bits set = chunk equal
	JEQ	eq_hugeloop
	RET

	// 4 bytes at a time using 32-bit register
eq_bigloop:
	CMPL	BX, $4
	JBE	eq_leftover
	MOVL	(SI), CX
	MOVL	(DI), DX
	ADDL	$4, SI
	ADDL	$4, DI
	SUBL	$4, BX
	CMPL	CX, DX
	JEQ	eq_bigloop
	RET

	// remaining 0-4 bytes
eq_leftover:
	// Compare the 4 bytes that end at the last byte.
	MOVL	-4(SI)(BX*1), CX
	MOVL	-4(DI)(BX*1), DX
	CMPL	CX, DX
	SETEQ	AX
	RET

eq_small:
	CMPL	BX, $0
	JEQ	eq_equal

	LEAL	0(BX*8), CX
	NEGL	CX			// shift count: -(8*count)

	MOVL	SI, DX
	CMPB	DX, $0xfc
	JA	eq_si_high

	// load at SI won't cross a page boundary.
	MOVL	(SI), SI
	JMP	eq_si_finish
eq_si_high:
	// address ends in 111111xx.  Load up to bytes we want, move to correct position.
	MOVL	-4(SI)(BX*1), SI
	SHRL	CX, SI
eq_si_finish:

	// same for DI.
	MOVL	DI, DX
	CMPB	DX, $0xfc
	JA	eq_di_high
	MOVL	(DI), DI
	JMP	eq_di_finish
eq_di_high:
	MOVL	-4(DI)(BX*1), DI
	SHRL	CX, DI
eq_di_finish:

	SUBL	SI, DI
	SHLL	CX, DI			// shift out the bytes beyond count
eq_equal:
	SETEQ	AX
	RET

// cmpstring(s1, s2 string) int: three-way comparison through cmpbody.
TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
	MOVL	s1+0(FP), SI
	MOVL	s1+4(FP), BX
	MOVL	s2+8(FP), DI
	MOVL	s2+12(FP), DX
	CALL	runtime·cmpbody(SB)
	MOVL	AX, res+16(FP)
	RET

// bytes·Compare(s1, s2 []byte) int: three-way comparison through cmpbody.
TEXT bytes·Compare(SB),NOSPLIT,$0-28
	MOVL	s1+0(FP), SI
	MOVL	s1+4(FP), BX
	MOVL	s2+12(FP), DI
	MOVL	s2+16(FP), DX
	CALL	runtime·cmpbody(SB)
	MOVL	AX, res+24(FP)
	RET

// bytes·IndexByte(s []byte, c byte) int: scans s for c with REPN SCASB
// and stores the index of the match, or -1.
TEXT bytes·IndexByte(SB),NOSPLIT,$0
	MOVL	s+0(FP), SI
	MOVL	s_len+4(FP), CX
	MOVB	c+12(FP), AL
	MOVL	SI, DI
	CLD; REPN; SCASB
	JZ	bytes_index_found
	MOVL	$-1, ret+16(FP)
	RET
bytes_index_found:
	SUBL	SI, DI			// DI is one past the matching byte
	SUBL	$1, DI
	MOVL	DI, ret+16(FP)
	RET

// strings·IndexByte(s string, c byte) int: scans s for c with REPN SCASB
// and stores the index of the match, or -1.
TEXT strings·IndexByte(SB),NOSPLIT,$0
	MOVL	s+0(FP), SI
	MOVL	s_len+4(FP), CX
	MOVB	c+8(FP), AL
	MOVL	SI, DI
	CLD; REPN; SCASB
	JZ	strings_index_found
	MOVL	$-1, ret+12(FP)
	RET
strings_index_found:
	SUBL	SI, DI			// DI is one past the matching byte
	SUBL	$1, DI
	MOVL	DI, ret+12(FP)
	RET

// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
// output:
//   AX = 1/0/-1
TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
	CMPL	SI, DI
	JEQ	cmp_allsame		// same pointer: only the lengths matter
	CMPL	BX, DX
	MOVL	DX, BP
	CMOVLLT	BX, BP			// BP = min(alen, blen)
	CMPL	BP, $4
	JB	cmp_small
	TESTL	$0x4000000, runtime·cpuid_edx(SB) // check for sse2
	JE	cmp_mediumloop
cmp_largeloop:
	// 16 bytes per round in xmm registers
	CMPL	BP, $16
	JB	cmp_mediumloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, AX
	XORL	$0xffff, AX		// convert EQ to NE
	JNE	cmp_diff16		// branch if at least one byte is not equal
	ADDL	$16, SI
	ADDL	$16, DI
	SUBL	$16, BP
	JMP	cmp_largeloop

cmp_diff16:
	BSFL	AX, BX			// index of first byte that differs
	XORL	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX
	LEAL	-1(AX*2), AX		// convert 1/0 to +1/-1
	RET

cmp_mediumloop:
	// 4 bytes per round in 32-bit registers
	CMPL	BP, $4
	JBE	cmp_0through4
	MOVL	(SI), AX
	MOVL	(DI), CX
	CMPL	AX, CX
	JNE	cmp_diff4
	ADDL	$4, SI
	ADDL	$4, DI
	SUBL	$4, BP
	JMP	cmp_mediumloop

cmp_0through4:
	// Compare the 4 bytes that end at the last common byte.
	MOVL	-4(SI)(BP*1), AX
	MOVL	-4(DI)(BP*1), CX
	CMPL	AX, CX
	JEQ	cmp_allsame

cmp_diff4:
	BSWAPL	AX			// reverse order of bytes
	BSWAPL	CX
	XORL	AX, CX			// find bit differences
	BSRL	CX, CX			// index of highest bit difference
	SHRL	CX, AX			// move a's bit to bottom
	ANDL	$1, AX			// mask bit
	LEAL	-1(AX*2), AX		// 1/0 => +1/-1
	RET

	// 0-3 bytes in common
cmp_small:
	LEAL	(BP*8), CX
	NEGL	CX			// shift count: -(8*BP)
	JEQ	cmp_allsame		// nothing in common to compare

	// load si
	CMPB	SI, $0xfc
	JA	cmp_si_high
	MOVL	(SI), SI
	JMP	cmp_si_finish
cmp_si_high:
	MOVL	-4(SI)(BP*1), SI
	SHRL	CX, SI
cmp_si_finish:
	SHLL	CX, SI

	// same for di
	CMPB	DI, $0xfc
	JA	cmp_di_high
	MOVL	(DI), DI
	JMP	cmp_di_finish
cmp_di_high:
	MOVL	-4(DI)(BP*1), DI
	SHRL	CX, DI
cmp_di_finish:
	SHLL	CX, DI

	BSWAPL	SI			// reverse order of bytes
	BSWAPL	DI
	XORL	SI, DI			// find bit differences
	JEQ	cmp_allsame
	BSRL	DI, CX			// index of highest bit difference
	SHRL	CX, SI			// move a's bit to bottom
	ANDL	$1, SI			// mask bit
	LEAL	-1(SI*2), AX		// 1/0 => +1/-1
	RET

	// all the bytes in common are the same, so we just need
	// to compare the lengths.
cmp_allsame:
	XORL	AX, AX
	XORL	CX, CX
	CMPL	BX, DX
	SETGT	AX			// 1 if alen > blen
	SETEQ	CX			// 1 if alen == blen
	LEAL	-1(CX)(AX*2), AX	// 1,0,-1 result
	RET