github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/aorrlsh_n.asm

dnl  AMD64 mpn_addlsh_n and mpn_rsblsh_n.  R = V2^k +- U.

dnl  Copyright 2006, 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                  cycles/limb
C AMD K8,K9         3.1     < 3.85 for lshift + add_n
C AMD K10           3.1     < 3.85 for lshift + add_n
C Intel P4         14.6     > 7.33 for lshift + add_n
C Intel core2       3.87    > 3.27 for lshift + add_n
C Intel NHM         4       > 3.75 for lshift + add_n
C Intel SBR        (5.8)    > 3.46 for lshift + add_n
C Intel atom       (7.75)   < 8.75 for lshift + add_n
C VIA nano          4.7     < 6.25 for lshift + add_n

C This was written quickly and not optimized at all.  Surely one could get
C closer to 3 c/l or perhaps even under 3 c/l.  Ideas:
C  1) Use indexing to save the 3 LEA
C  2) Write reasonable feed-in code
C  3) Be more clever about register usage
C  4) Unroll more, handling CL negation, carry save/restore cost much now
C  5) Reschedule
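
C For reference, a rough C equivalent of the addlsh case (not part of the
C original file; ref_addlsh_n is a hypothetical name, mp_limb_t, mp_size_t and
C GMP_NUMB_BITS are taken from gmp.h, and nails-free limbs with
C 0 < cnt < GMP_NUMB_BITS are assumed).  It computes {rp,n} = {vp,n}*2^cnt
C + {up,n} and returns the bits shifted out of the top V limb plus the final
C add carry, which is the value the code below returns in %rax; the rsblsh
C variant subtracts U and propagates a borrow instead.
C
C   mp_limb_t
C   ref_addlsh_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
C                 mp_size_t n, unsigned int cnt)
C   {
C     mp_limb_t cy = 0;                              /* running add carry */
C     mp_limb_t sh = 0;                              /* bits shifted out of previous V limb */
C     for (mp_size_t i = 0; i < n; i++)
C       {
C         mp_limb_t vl = (vp[i] << cnt) | sh;        /* shifted V limb */
C         sh = vp[i] >> (GMP_NUMB_BITS - cnt);       /* carry bits for the next limb */
C         mp_limb_t t = up[i] + cy;                  /* fold in previous carry */
C         mp_limb_t c1 = t < cy;
C         mp_limb_t s = vl + t;                      /* the ADCSBB step */
C         mp_limb_t c2 = s < vl;
C         rp[i] = s;
C         cy = c1 | c2;
C       }
C     return sh + cy;                                /* matches the final ADCSBB $0, %rbp */
C   }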

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`up',  `%rsi')
define(`vp',  `%rdx')
define(`n',   `%rcx')
define(`cnt', `%r8')

ifdef(`OPERATION_addlsh_n',`
  define(ADCSBB, `adc')
  define(func,   mpn_addlsh_n)
')
ifdef(`OPERATION_rsblsh_n',`
  define(ADCSBB, `sbb')
  define(func,   mpn_rsblsh_n)
')

MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(func)
        FUNC_ENTRY(4)
IFDOS(` mov     56(%rsp), %r8d  ')
        push    %r12
        push    %r13
        push    %r14
        push    %rbp
        push    %rbx

        mov     n, %rax
        xor     R32(%rbx), R32(%rbx)    C clear carry save register
        mov     R32(%r8), R32(%rcx)     C shift count
        xor     R32(%rbp), R32(%rbp)    C limb carry

        mov     R32(%rax), R32(%r11)
        and     $3, R32(%r11)
        je      L(4)
        sub     $1, R32(%r11)

L(012): mov     (vp), %r8
        mov     %r8, %r12
        shl     R8(%rcx), %r8
        or      %rbp, %r8
        neg     R8(%rcx)
        mov     %r12, %rbp
        shr     R8(%rcx), %rbp
        neg     R8(%rcx)
        add     R32(%rbx), R32(%rbx)
        ADCSBB  (up), %r8
        mov     %r8, (rp)
        sbb     R32(%rbx), R32(%rbx)
        lea     8(up), up
        lea     8(vp), vp
        lea     8(rp), rp
        sub     $1, R32(%r11)
        jnc     L(012)

L(4):   sub     $4, %rax
        jc      L(end)

        ALIGN(16)
L(top): mov     (vp), %r8
        mov     %r8, %r12
        mov     8(vp), %r9
        mov     %r9, %r13
        mov     16(vp), %r10
        mov     %r10, %r14
        mov     24(vp), %r11

        shl     R8(%rcx), %r8
        shl     R8(%rcx), %r9
        shl     R8(%rcx), %r10
        or      %rbp, %r8
        mov     %r11, %rbp
        shl     R8(%rcx), %r11

        neg     R8(%rcx)

        shr     R8(%rcx), %r12
        shr     R8(%rcx), %r13
        shr     R8(%rcx), %r14
        shr     R8(%rcx), %rbp          C used next iteration

        or      %r12, %r9
        or      %r13, %r10
        or      %r14, %r11

        neg     R8(%rcx)

        add     R32(%rbx), R32(%rbx)    C restore carry flag

        ADCSBB  (up), %r8
        ADCSBB  8(up), %r9
        ADCSBB  16(up), %r10
        ADCSBB  24(up), %r11

        mov     %r8, (rp)
        mov     %r9, 8(rp)
        mov     %r10, 16(rp)
        mov     %r11, 24(rp)

        sbb     R32(%rbx), R32(%rbx)    C save carry flag

        lea     32(up), up
        lea     32(vp), vp
        lea     32(rp), rp

        sub     $4, %rax
        jnc     L(top)

L(end): add     R32(%rbx), R32(%rbx)
        ADCSBB  $0, %rbp
        mov     %rbp, %rax
        pop     %rbx
        pop     %rbp
        pop     %r14
        pop     %r13
        pop     %r12
        FUNC_EXIT()
        ret
EPILOGUE()