/* -----------------------------------------------------------------------
   unix64.S - Copyright (c) 2013  The Written Word, Inc.
	    - Copyright (c) 2008  Red Hat, Inc
	    - Copyright (c) 2002  Bo Thorsen <bo@suse.de>

   x86-64 Foreign Function Interface

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   ``Software''), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
   DEALINGS IN THE SOFTWARE.
   ----------------------------------------------------------------------- */

/* Syntax: GNU as, AT&T operand order, run through the C preprocessor.
   Target: x86-64, System V calling convention.  */

#ifdef __x86_64__
#define LIBFFI_ASM
#include <fficonfig.h>
#include <ffi.h>
#include "internal64.h"

	.text

/* C(X) prepends the platform's user label prefix (if any) to X.  */
#define C2(X, Y)  X ## Y
#define C1(X, Y)  C2(X, Y)
#ifdef __USER_LABEL_PREFIX__
# define C(X)     C1(__USER_LABEL_PREFIX__, X)
#else
# define C(X)     X
#endif

/* L(X) builds an assembler-local label name.  */
#ifdef __APPLE__
# define L(X)     C1(L, X)
#else
# define L(X)     C1(.L, X)
#endif

/* PLT(X) routes a call through the PLT on ELF; ENDF(X) records the
   symbol type and size on ELF and expands to nothing elsewhere.  */
#ifdef __ELF__
# define PLT(X)	  X@PLT
# define ENDF(X)  .type	X,@function; .size X, . - X
#else
# define PLT(X)	  X
# define ENDF(X)
#endif

/* This macro allows the safe creation of jump tables without an
   actual table.  The entry points into the table are all 8 bytes.
   The use of ORG asserts that we're at the correct location.  */
/* ??? The clang assembler doesn't handle .org with symbolic expressions.  */
#if defined(__clang__) || defined(__APPLE__)
# define E(BASE, X)	.balign 8
#else
# define E(BASE, X)	.balign 8; .org BASE + X * 8
#endif

/* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
	            void *raddr, void (*fnaddr)(void));

   Bit o trickiness here -- ARGS+BYTES is the base of the stack frame
   for this function.  This has been allocated by ffi_call.  We also
   deallocate some of the stack that has been alloca'd.

   On entry:
	%rdi = args	base of the register save area (layout below)
	%rsi = bytes	ARGS + BYTES is where the local frame is built
	%rdx = flags	low byte selects the store_table entry; for the
			structure cases, flags >> UNIX64_SIZE_SHIFT is
			the number of bytes copied to raddr
	%rcx = raddr	where the return value is stored
	%r8  = fnaddr	function to call

   Offsets read from ARGS by this routine:
	0x00 .. 0x28	%rdi, %rsi, %rdx, %rcx, %r8, %r9
	0x30 .. 0xa0	%xmm0 .. %xmm7, one 16-byte slot each (movdqa)
	0xb0		32-bit value loaded into %eax; the vector
			registers are loaded only when it is nonzero
	0xb8		quadword popped into %r10 just before the call
	0xc0		top of stack at the call

   Local frame at ARGS + BYTES (addressed through %rbp):
	 0	flags
	 8	raddr
	16	caller's %rbp
	24	relocated return address  */

	.balign	8
	.globl	C(ffi_call_unix64)
	FFI_HIDDEN(C(ffi_call_unix64))

C(ffi_call_unix64):
L(UW0):
	movq	(%rsp), %r10		/* Load return address.  */
	leaq	(%rdi, %rsi), %rax	/* Find local stack base.  */
	movq	%rdx, (%rax)		/* Save flags.  */
	movq	%rcx, 8(%rax)		/* Save raddr.  */
	movq	%rbp, 16(%rax)		/* Save old frame pointer.  */
	movq	%r10, 24(%rax)		/* Relocate return address.  */
	movq	%rax, %rbp		/* Finalize local stack frame.  */

	/* New stack frame based off rbp.  This is an itty bit of unwind
	   trickery in that the CFA *has* changed.  There is no easy way
	   to describe it correctly on entry to the function.  Fortunately,
	   it doesn't matter too much since at all points we can correctly
	   unwind back to ffi_call.  Note that the location to which we
	   moved the return address is (the new) CFA-8, so from the
	   perspective of the unwind info, it hasn't moved.  */
L(UW1):
	/* cfi_def_cfa(%rbp, 32) */
	/* cfi_rel_offset(%rbp, 16) */

	movq	%rdi, %r10		/* Save a copy of the register area. */
	movq	%r8, %r11		/* Save a copy of the target fn.  */

	/* Load up all argument registers.  %eax is written only once,
	   from the register area; an earlier copy from %r9 was dead
	   (overwritten here before any use) and has been dropped.  */
	movq	(%r10), %rdi
	movq	0x08(%r10), %rsi
	movq	0x10(%r10), %rdx
	movq	0x18(%r10), %rcx
	movq	0x20(%r10), %r8
	movq	0x28(%r10), %r9
	movl	0xb0(%r10), %eax	/* Set number of SSE registers.  */
	testl	%eax, %eax
	jnz	L(load_sse)
L(ret_from_load_sse):

	/* Deallocate the reg arg area, except for r10, then load via pop.  */
	leaq	0xb8(%r10), %rsp
	popq	%r10

	/* Call the user function.  */
	call	*%r11

	/* Deallocate stack arg area; local stack frame in redzone.  */
	leaq	24(%rbp), %rsp

	movq	0(%rbp), %rcx		/* Reload flags.  */
	movq	8(%rbp), %rdi		/* Reload raddr.  */
	movq	16(%rbp), %rbp		/* Reload old frame pointer.  */
L(UW2):
	/* cfi_remember_state */
	/* cfi_def_cfa(%rsp, 8) */
	/* cfi_restore(%rbp) */

	/* The first byte of the flags contains the FFI_TYPE.  The flags
	   of the cmpb survive to the ja: movzbl and leaq do not modify
	   them.  Codes above UNIX64_RET_LAST abort.  */
	cmpb	$UNIX64_RET_LAST, %cl
	movzbl	%cl, %r10d
	leaq	L(store_table)(%rip), %r11
	ja	L(sa)
	leaq	(%r11, %r10, 8), %r10	/* Entry = table + code * 8.  */

	/* Prep for the structure cases: scratch area in redzone.  */
	leaq	-20(%rsp), %rsi
	jmp	*%r10

	/* Each entry stores the callee's return value through %rdi
	   (raddr).  The structure entries first assemble both halves in
	   the scratch area at %rsi, then copy the byte count taken from
	   the flags.  */
	.balign	8
L(store_table):
E(L(store_table), UNIX64_RET_VOID)
	ret
E(L(store_table), UNIX64_RET_UINT8)
	movzbl	%al, %eax
	movq	%rax, (%rdi)
	ret
E(L(store_table), UNIX64_RET_UINT16)
	movzwl	%ax, %eax
	movq	%rax, (%rdi)
	ret
E(L(store_table), UNIX64_RET_UINT32)
	movl	%eax, %eax		/* Zero-extend to 64 bits.  */
	movq	%rax, (%rdi)
	ret
E(L(store_table), UNIX64_RET_SINT8)
	movsbq	%al, %rax
	movq	%rax, (%rdi)
	ret
E(L(store_table), UNIX64_RET_SINT16)
	movswq	%ax, %rax
	movq	%rax, (%rdi)
	ret
E(L(store_table), UNIX64_RET_SINT32)
	cltq				/* Sign-extend %eax into %rax.  */
	movq	%rax, (%rdi)
	ret
E(L(store_table), UNIX64_RET_INT64)
	movq	%rax, (%rdi)
	ret
E(L(store_table), UNIX64_RET_XMM32)
	movd	%xmm0, (%rdi)
	ret
E(L(store_table), UNIX64_RET_XMM64)
	movq	%xmm0, (%rdi)
	ret
E(L(store_table), UNIX64_RET_X87)
	fstpt	(%rdi)
	ret
E(L(store_table), UNIX64_RET_X87_2)
	fstpt	(%rdi)
	fstpt	16(%rdi)
	ret
E(L(store_table), UNIX64_RET_ST_XMM0_RAX)
	movq	%rax, 8(%rsi)
	jmp	L(s3)
E(L(store_table), UNIX64_RET_ST_RAX_XMM0)
	movq	%xmm0, 8(%rsi)
	jmp	L(s2)
E(L(store_table), UNIX64_RET_ST_XMM0_XMM1)
	movq	%xmm1, 8(%rsi)
	jmp	L(s3)
E(L(store_table), UNIX64_RET_ST_RAX_RDX)
	movq	%rdx, 8(%rsi)
L(s2):
	/* First eightbyte comes from %rax.  */
	movq	%rax, (%rsi)
	shrl	$UNIX64_SIZE_SHIFT, %ecx	/* %ecx = byte count.  */
	rep movsb			/* Copy scratch (%rsi) to raddr (%rdi).  */
	ret
	.balign 8
L(s3):
	/* First eightbyte comes from %xmm0.  */
	movq	%xmm0, (%rsi)
	shrl	$UNIX64_SIZE_SHIFT, %ecx	/* %ecx = byte count.  */
	rep movsb			/* Copy scratch (%rsi) to raddr (%rdi).  */
	ret

L(sa):	call	PLT(C(abort))

	/* Many times we can avoid loading any SSE registers at all.
	   It's not worth an indirect jump to load the exact set of
	   SSE registers needed; zero or all is a good compromise.  */
	.balign 2
L(UW3):
	/* cfi_restore_state */
L(load_sse):
	movdqa	0x30(%r10), %xmm0
	movdqa	0x40(%r10), %xmm1
	movdqa	0x50(%r10), %xmm2
	movdqa	0x60(%r10), %xmm3
	movdqa	0x70(%r10), %xmm4
	movdqa	0x80(%r10), %xmm5
	movdqa	0x90(%r10), %xmm6
	movdqa	0xa0(%r10), %xmm7
	jmp	L(ret_from_load_sse)

L(UW4):
ENDF(C(ffi_call_unix64))

/* Closure entry frame, shared by all four closure entry points below.
   6 general registers, 8 vector registers,
   32 bytes of rvalue, 8 bytes of alignment.  */
#define ffi_closure_OFS_G	0
#define ffi_closure_OFS_V	(6*8)
#define ffi_closure_OFS_RVALUE	(ffi_closure_OFS_V + 8*16)
#define ffi_closure_FS		(ffi_closure_OFS_RVALUE + 32 + 8)

/* The location of rvalue within the red zone after deallocating the frame.  */
#define ffi_closure_RED_RVALUE	(ffi_closure_OFS_RVALUE - ffi_closure_FS)

/* ffi_closure_unix64_sse

   Same as ffi_closure_unix64, but first spills %xmm0 .. %xmm7 into the
   vector part of the frame, then joins the common path at sse_entry1.  */

	.balign	2
	.globl	C(ffi_closure_unix64_sse)
	FFI_HIDDEN(C(ffi_closure_unix64_sse))

C(ffi_closure_unix64_sse):
L(UW5):
	subq	$ffi_closure_FS, %rsp
L(UW6):
	/* cfi_adjust_cfa_offset(ffi_closure_FS) */

	movdqa	%xmm0, ffi_closure_OFS_V+0x00(%rsp)
	movdqa	%xmm1, ffi_closure_OFS_V+0x10(%rsp)
	movdqa	%xmm2, ffi_closure_OFS_V+0x20(%rsp)
	movdqa	%xmm3, ffi_closure_OFS_V+0x30(%rsp)
	movdqa	%xmm4, ffi_closure_OFS_V+0x40(%rsp)
	movdqa	%xmm5, ffi_closure_OFS_V+0x50(%rsp)
	movdqa	%xmm6, ffi_closure_OFS_V+0x60(%rsp)
	movdqa	%xmm7, ffi_closure_OFS_V+0x70(%rsp)
	jmp	L(sse_entry1)

L(UW7):
ENDF(C(ffi_closure_unix64_sse))

/* ffi_closure_unix64

   On entry %r10 is the base from which cif, fun and user_data are
   loaded, starting at offset FFI_TRAMPOLINE_SIZE.  The six integer
   argument registers are spilled to the frame, then
   ffi_closure_unix64_inner is called with

	%rdi = cif		%rcx = rvalue (frame + ffi_closure_OFS_RVALUE)
	%rsi = fun		%r8  = reg_args (frame base)
	%rdx = user_data	%r9  = argp (frame + ffi_closure_FS + 8)

   The low byte of its result (%al) selects the load_table entry that
   moves rvalue into the return registers.  */

	.balign	2
	.globl	C(ffi_closure_unix64)
	FFI_HIDDEN(C(ffi_closure_unix64))

C(ffi_closure_unix64):
L(UW8):
	subq	$ffi_closure_FS, %rsp
L(UW9):
	/* cfi_adjust_cfa_offset(ffi_closure_FS) */
L(sse_entry1):
	movq	%rdi, ffi_closure_OFS_G+0x00(%rsp)
	movq    %rsi, ffi_closure_OFS_G+0x08(%rsp)
	movq    %rdx, ffi_closure_OFS_G+0x10(%rsp)
	movq    %rcx, ffi_closure_OFS_G+0x18(%rsp)
	movq    %r8,  ffi_closure_OFS_G+0x20(%rsp)
	movq    %r9,  ffi_closure_OFS_G+0x28(%rsp)

#ifdef __ILP32__
	movl	FFI_TRAMPOLINE_SIZE(%r10), %edi		/* Load cif */
	movl	FFI_TRAMPOLINE_SIZE+4(%r10), %esi	/* Load fun */
	movl	FFI_TRAMPOLINE_SIZE+8(%r10), %edx	/* Load user_data */
#else
	movq	FFI_TRAMPOLINE_SIZE(%r10), %rdi		/* Load cif */
	movq	FFI_TRAMPOLINE_SIZE+8(%r10), %rsi	/* Load fun */
	movq	FFI_TRAMPOLINE_SIZE+16(%r10), %rdx	/* Load user_data */
#endif
	/* Common tail; ffi_go_closure_unix64 also jumps here.  */
L(do_closure):
	leaq	ffi_closure_OFS_RVALUE(%rsp), %rcx	/* Load rvalue */
	movq	%rsp, %r8				/* Load reg_args */
	leaq	ffi_closure_FS+8(%rsp), %r9		/* Load argp */
	call	C(ffi_closure_unix64_inner)

	/* Deallocate stack frame early; return value is now in redzone.  */
	addq	$ffi_closure_FS, %rsp
L(UW10):
	/* cfi_adjust_cfa_offset(-ffi_closure_FS) */

	/* The first byte of the return value contains the FFI_TYPE.  The
	   flags of the cmpb survive to the ja: movzbl and leaq do not
	   modify them.  Codes above UNIX64_RET_LAST abort.  */
	cmpb	$UNIX64_RET_LAST, %al
	movzbl	%al, %r10d
	leaq	L(load_table)(%rip), %r11
	ja	L(la)
	leaq	(%r11, %r10, 8), %r10	/* Entry = table + code * 8.  */
	leaq	ffi_closure_RED_RVALUE(%rsp), %rsi
	jmp	*%r10

	/* Each entry loads the return registers from rvalue (%rsi).  */
	.balign	8
L(load_table):
E(L(load_table), UNIX64_RET_VOID)
	ret
E(L(load_table), UNIX64_RET_UINT8)
	movzbl	(%rsi), %eax
	ret
E(L(load_table), UNIX64_RET_UINT16)
	movzwl	(%rsi), %eax
	ret
E(L(load_table), UNIX64_RET_UINT32)
	movl	(%rsi), %eax
	ret
E(L(load_table), UNIX64_RET_SINT8)
	movsbl	(%rsi), %eax
	ret
E(L(load_table), UNIX64_RET_SINT16)
	movswl	(%rsi), %eax
	ret
E(L(load_table), UNIX64_RET_SINT32)
	movl	(%rsi), %eax
	ret
E(L(load_table), UNIX64_RET_INT64)
	movq	(%rsi), %rax
	ret
E(L(load_table), UNIX64_RET_XMM32)
	movd	(%rsi), %xmm0
	ret
E(L(load_table), UNIX64_RET_XMM64)
	movq	(%rsi), %xmm0
	ret
E(L(load_table), UNIX64_RET_X87)
	fldt	(%rsi)
	ret
E(L(load_table), UNIX64_RET_X87_2)
	fldt	16(%rsi)
	fldt	(%rsi)
	ret
E(L(load_table), UNIX64_RET_ST_XMM0_RAX)
	movq	8(%rsi), %rax
	jmp	L(l3)
E(L(load_table), UNIX64_RET_ST_RAX_XMM0)
	movq	8(%rsi), %xmm0
	jmp	L(l2)
E(L(load_table), UNIX64_RET_ST_XMM0_XMM1)
	movq	8(%rsi), %xmm1
	jmp	L(l3)
E(L(load_table), UNIX64_RET_ST_RAX_RDX)
	movq	8(%rsi), %rdx
L(l2):
	/* First eightbyte goes to %rax.  */
	movq	(%rsi), %rax
	ret
	.balign	8
L(l3):
	/* First eightbyte goes to %xmm0.  */
	movq	(%rsi), %xmm0
	ret

L(la):	call	PLT(C(abort))

L(UW11):
ENDF(C(ffi_closure_unix64))

/* ffi_go_closure_unix64_sse

   Same as ffi_go_closure_unix64, but first spills %xmm0 .. %xmm7 into
   the vector part of the frame, then joins the common path at
   sse_entry2.  */

	.balign	2
	.globl	C(ffi_go_closure_unix64_sse)
	FFI_HIDDEN(C(ffi_go_closure_unix64_sse))

C(ffi_go_closure_unix64_sse):
L(UW12):
	subq	$ffi_closure_FS, %rsp
L(UW13):
	/* cfi_adjust_cfa_offset(ffi_closure_FS) */

	movdqa	%xmm0, ffi_closure_OFS_V+0x00(%rsp)
	movdqa	%xmm1, ffi_closure_OFS_V+0x10(%rsp)
	movdqa	%xmm2, ffi_closure_OFS_V+0x20(%rsp)
	movdqa	%xmm3, ffi_closure_OFS_V+0x30(%rsp)
	movdqa	%xmm4, ffi_closure_OFS_V+0x40(%rsp)
	movdqa	%xmm5, ffi_closure_OFS_V+0x50(%rsp)
	movdqa	%xmm6, ffi_closure_OFS_V+0x60(%rsp)
	movdqa	%xmm7, ffi_closure_OFS_V+0x70(%rsp)
	jmp	L(sse_entry2)

L(UW14):
ENDF(C(ffi_go_closure_unix64_sse))

/* ffi_go_closure_unix64

   On entry %r10 points at the closure: cif is loaded from the second
   pointer-sized slot, fun from the third, and %r10 itself is passed on
   as user_data.  After spilling the integer argument registers it
   continues at do_closure inside ffi_closure_unix64.  */

	.balign	2
	.globl	C(ffi_go_closure_unix64)
	FFI_HIDDEN(C(ffi_go_closure_unix64))

C(ffi_go_closure_unix64):
L(UW15):
	subq	$ffi_closure_FS, %rsp
L(UW16):
	/* cfi_adjust_cfa_offset(ffi_closure_FS) */
L(sse_entry2):
	movq	%rdi, ffi_closure_OFS_G+0x00(%rsp)
	movq    %rsi, ffi_closure_OFS_G+0x08(%rsp)
	movq    %rdx, ffi_closure_OFS_G+0x10(%rsp)
	movq    %rcx, ffi_closure_OFS_G+0x18(%rsp)
	movq    %r8,  ffi_closure_OFS_G+0x20(%rsp)
	movq    %r9,  ffi_closure_OFS_G+0x28(%rsp)

#ifdef __ILP32__
	movl	4(%r10), %edi		/* Load cif */
	movl	8(%r10), %esi		/* Load fun */
	movl	%r10d, %edx		/* Load closure (user_data) */
#else
	movq	8(%r10), %rdi		/* Load cif */
	movq	16(%r10), %rsi		/* Load fun */
	movq	%r10, %rdx		/* Load closure (user_data) */
#endif
	jmp	L(do_closure)

L(UW17):
ENDF(C(ffi_go_closure_unix64))

/* Sadly, OSX cctools-as doesn't understand .cfi directives at all.
   The call-frame information is therefore emitted by hand below: one
   CIE followed by one FDE per entry point.  The L(UWn) labels referenced
   here mark the code addresses at which the unwind state changes.  */

#ifdef __APPLE__
.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
EHFrame0:
#elif defined(HAVE_AS_X86_64_UNWIND_SECTION_TYPE)
.section .eh_frame,"a",@unwind
#else
.section .eh_frame,"a",@progbits
#endif

#ifdef HAVE_AS_X86_PCREL
# define PCREL(X)	X - .
#else
# define PCREL(X)	X@rel
#endif

/* Simplify advancing between labels.  Assume DW_CFA_advance_loc1 fits.  */
#define ADV(N, P)	.byte 2, L(N)-L(P)

/* Every entry below is padded with .balign 8 before its end label, so
   each CIE/FDE starts on an 8-byte boundary; the padding bytes are
   inside the entry's recorded length.  */

	.balign 8
L(CIE):
	.set	L(set0),L(ECIE)-L(SCIE)
	.long	L(set0)			/* CIE Length */
L(SCIE):
	.long	0			/* CIE Identifier Tag */
	.byte	1			/* CIE Version */
	.ascii	"zR\0"			/* CIE Augmentation */
	.byte	1			/* CIE Code Alignment Factor */
	.byte	0x78			/* CIE Data Alignment Factor */
	.byte	0x10			/* CIE RA Column */
	.byte	1			/* Augmentation size */
	.byte	0x1b			/* FDE Encoding (pcrel sdata4) */
	.byte	0xc, 7, 8		/* DW_CFA_def_cfa, %rsp offset 8 */
	.byte	0x80+16, 1		/* DW_CFA_offset, %rip offset 1*-8 */
	.balign 8
L(ECIE):

	/* FDE1: ffi_call_unix64 (UW0 .. UW4).  */
	.set	L(set1),L(EFDE1)-L(SFDE1)
	.long	L(set1)			/* FDE Length */
L(SFDE1):
	.long	L(SFDE1)-L(CIE)		/* FDE CIE offset */
	.long	PCREL(L(UW0))		/* Initial location */
	.long	L(UW4)-L(UW0)		/* Address range */
	.byte	0			/* Augmentation size */
	ADV(UW1, UW0)
	.byte	0xc, 6, 32		/* DW_CFA_def_cfa, %rbp 32 */
	.byte	0x80+6, 2		/* DW_CFA_offset, %rbp 2*-8 */
	ADV(UW2, UW1)
	.byte	0xa			/* DW_CFA_remember_state */
	.byte	0xc, 7, 8		/* DW_CFA_def_cfa, %rsp 8 */
	.byte	0xc0+6			/* DW_CFA_restore, %rbp */
	ADV(UW3, UW2)
	.byte	0xb			/* DW_CFA_restore_state */
	.balign	8
L(EFDE1):

	/* FDE2: ffi_closure_unix64_sse (UW5 .. UW7).  */
	.set	L(set2),L(EFDE2)-L(SFDE2)
	.long	L(set2)			/* FDE Length */
L(SFDE2):
	.long	L(SFDE2)-L(CIE)		/* FDE CIE offset */
	.long	PCREL(L(UW5))		/* Initial location */
	.long	L(UW7)-L(UW5)		/* Address range */
	.byte	0			/* Augmentation size */
	ADV(UW6, UW5)
	.byte	0xe			/* DW_CFA_def_cfa_offset */
	.byte	ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS + 8 < 256 */
	.balign	8
L(EFDE2):

	/* FDE3: ffi_closure_unix64 (UW8 .. UW11).  */
	.set	L(set3),L(EFDE3)-L(SFDE3)
	.long	L(set3)			/* FDE Length */
L(SFDE3):
	.long	L(SFDE3)-L(CIE)		/* FDE CIE offset */
	.long	PCREL(L(UW8))		/* Initial location */
	.long	L(UW11)-L(UW8)		/* Address range */
	.byte	0			/* Augmentation size */
	ADV(UW9, UW8)
	.byte	0xe			/* DW_CFA_def_cfa_offset */
	.byte	ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS + 8 < 256 */
	ADV(UW10, UW9)
	.byte	0xe, 8			/* DW_CFA_def_cfa_offset 8 */
	/* Pad like every other entry; without this the next FDE would
	   begin at a misaligned offset.  */
	.balign	8
L(EFDE3):

	/* FDE4: ffi_go_closure_unix64_sse (UW12 .. UW14).  */
	.set	L(set4),L(EFDE4)-L(SFDE4)
	.long	L(set4)			/* FDE Length */
L(SFDE4):
	.long	L(SFDE4)-L(CIE)		/* FDE CIE offset */
	.long	PCREL(L(UW12))		/* Initial location */
	.long	L(UW14)-L(UW12)		/* Address range */
	.byte	0			/* Augmentation size */
	ADV(UW13, UW12)
	.byte	0xe			/* DW_CFA_def_cfa_offset */
	.byte	ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS + 8 < 256 */
	.balign	8
L(EFDE4):

	/* FDE5: ffi_go_closure_unix64 (UW15 .. UW17).  */
	.set	L(set5),L(EFDE5)-L(SFDE5)
	.long	L(set5)			/* FDE Length */
L(SFDE5):
	.long	L(SFDE5)-L(CIE)		/* FDE CIE offset */
	.long	PCREL(L(UW15))		/* Initial location */
	.long	L(UW17)-L(UW15)		/* Address range */
	.byte	0			/* Augmentation size */
	ADV(UW16, UW15)
	.byte	0xe			/* DW_CFA_def_cfa_offset */
	.byte	ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS + 8 < 256 */
	.balign	8
L(EFDE5):
#ifdef __APPLE__
	.subsections_via_symbols
#endif

#endif /* __x86_64__ */
#if defined __ELF__ && defined __linux__
	.section	.note.GNU-stack,"",@progbits
#endif