dnl  Origin: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/sublsh1_n.asm

dnl  AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)

dnl  Copyright 2003, 2005-2007, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 2.2
C AMD K10	 2.2
C Intel P4	12.75
C Intel core2	 3.45
C Intel corei	 ?
C Intel atom	 ?
C VIA nano	 3.25

C Sometimes speed degenerates, presumably because some operand alignments
C cause cache conflicts.

C The speed is limited by decoding/issue bandwidth.  There are 26 instructions
C in the loop, which corresponds to 26/3/4 = 2.167 c/l.
C INPUT PARAMETERS
define(`rp',`%rdi')
define(`up',`%rsi')
define(`vp',`%rdx')
define(`n', `%rcx')

C mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1), n limbs, lowest address first.
C
C Two carry chains run over the same limbs:
C   scy   carry out of doubling the vp limbs (add/adc reg,reg)
C   acy   borrow out of subtracting the doubled limbs from up (sub/sbb)
C Only one chain can own CF at a time.  The idle one is parked in a register
C as 0 or -1 with sbb reg,reg and brought back with add reg,reg, which sets
C CF exactly when the register held -1.
C
C Register roles:
C   rax      n mod 4 during dispatch, then parked scy, finally the result
C   rbp      parked acy between groups; scratch for up limbs inside a group
C   rbx      scratch for up limbs
C   r8-r11   doubled vp limbs
C   n        negative limb index, counts up towards zero
C
C Result: eax = scy + acy after the last limb, that is 0, 1 or 2.
C
C The limb count must be at least 1.  A count of zero has its two low bits
C clear, so it would enter at L(b00) and process four limbs.
C
C NOTE(review): FUNC_ENTRY and FUNC_EXIT come from the included m4
C configuration and are not visible here; presumably they map the DOS64
C argument registers onto the names defined above -- confirm there.

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_sublsh1_n)
	FUNC_ENTRY(4)
	push	%rbx			C rbx and rbp are callee-saved
	push	%rbp

	mov	(vp), %r8		C preload vp[0] before vp is advanced
	mov	R32(n), R32(%rax)
	lea	(rp,n,8), rp		C point all three operands just past
	lea	(up,n,8), up		C their last limb ...
	lea	(vp,n,8), vp
	neg	n			C ... and index them with -n .. -1
	xor	R32(%rbp), R32(%rbp)	C acy = 0 for the L(b00) entry
	and	$3, R32(%rax)		C n mod 4; also leaves CF clear for L(b00)
	je	L(b00)
	cmp	$2, R32(%rax)
	jc	L(b01)			C n mod 4 = 1
	je	L(b10)			C n mod 4 = 2

C n mod 4 = 3: handle three limbs ahead of the 4-way loop.
L(b11):	add	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	mov	16(vp,n,8), %r10
	adc	%r10, %r10
	sbb	R32(%rax), R32(%rax)	C save scy
	mov	(up,n,8), %rbp
	mov	8(up,n,8), %rbx
	sub	%r8, %rbp
	sbb	%r9, %rbx
	mov	%rbp, (rp,n,8)
	mov	%rbx, 8(rp,n,8)
	mov	16(up,n,8), %rbp
	sbb	%r10, %rbp
	mov	%rbp, 16(rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	add	$3, n
	jmp	L(ent)			C jmp keeps the sign flag from the add

C n mod 4 = 2: handle two limbs ahead of the 4-way loop.
L(b10):	add	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	sbb	R32(%rax), R32(%rax)	C save scy
	mov	(up,n,8), %rbp
	mov	8(up,n,8), %rbx
	sub	%r8, %rbp
	sbb	%r9, %rbx
	mov	%rbp, (rp,n,8)
	mov	%rbx, 8(rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	add	$2, n
	jmp	L(ent)			C jmp keeps the sign flag from the add

C n mod 4 = 1: handle one limb ahead of the 4-way loop.
L(b01):	add	%r8, %r8
	sbb	R32(%rax), R32(%rax)	C save scy
	mov	(up,n,8), %rbp
	sub	%r8, %rbp
	mov	%rbp, (rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	inc	n
L(ent):	jns	L(end)			C index reached zero: nothing left to loop over

C Main loop, four limbs per pass.  L(b00) enters after the first load
C because vp[0] is already in r8 and CF is already clear.
	ALIGN(16)
L(top):	add	R32(%rax), R32(%rax)	C restore scy

	mov	(vp,n,8), %r8
L(b00):	adc	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	mov	16(vp,n,8), %r10
	adc	%r10, %r10
	mov	24(vp,n,8), %r11
	adc	%r11, %r11

	sbb	R32(%rax), R32(%rax)	C save scy
	add	R32(%rbp), R32(%rbp)	C restore acy

	mov	(up,n,8), %rbp
	mov	8(up,n,8), %rbx
	sbb	%r8, %rbp
	sbb	%r9, %rbx
	mov	%rbp, (rp,n,8)
	mov	%rbx, 8(rp,n,8)
	mov	16(up,n,8), %rbp
	mov	24(up,n,8), %rbx
	sbb	%r10, %rbp
	sbb	%r11, %rbx
	mov	%rbp, 16(rp,n,8)
	mov	%rbx, 24(rp,n,8)

	sbb	R32(%rbp), R32(%rbp)	C save acy
	add	$4, n
	js	L(top)			C index still negative: more limbs remain

L(end):	add	R32(%rbp), R32(%rax)	C eax = -(scy + acy)
	neg	R32(%rax)		C return scy + acy

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()