dnl  AMD64 mpn_lshsub_n.  R = 2^k(U - V).

dnl  Origin of this copy:
dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/lshsub_n.asm

dnl  Copyright 2006, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
C AMD K10	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
C Intel P4	16.5
C Intel core2	 4.35
C Intel corei	 ?
C Intel atom	 ?
C VIA nano	 ?

C This was written quickly and not optimized at all, but it runs very well on
C K8.  But perhaps one could get under 3 c/l.  Ideas:
C   1) Use indexing to save the 3 LEA
C   2) Write reasonable feed-in code
C   3) Be more clever about register usage
C   4) Unroll more, handling CL negation, carry save/restore cost much now
C   5) Reschedule

C INPUT PARAMETERS
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`vp',	`%rdx')
define(`n',	`%rcx')
define(`cnt',	`%r8')

C REGISTER USAGE (as established by the code below)
C   rax      copy of n; counts down by 4 in the unrolled loop and finally
C            carries the return value
C   rbx      saved borrow between limb groups: 0 or all ones, produced by
C            sbb of the register with itself
C   cl       shift count (low byte of cnt); negated around each shr so the
C            right shift count becomes -cnt, which the CPU reduces mod 64
C   r15      bits shifted out of the top of the previous difference limb,
C            merged into the bottom of the next result limb
C   r8-r11   difference limbs (r11 doubles as the feed-in loop counter)
C   r12-r14  unshifted copies of difference limbs, source of the out-shifted
C            bits
C
C NOTE(review): the shr by -cnt trick gives a shift of 64-cnt only when
C 1 <= cnt <= 63; presumably callers guarantee that range -- confirm.

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_lshsub_n)
	FUNC_ENTRY(4)
C DOS64 only: the fifth argument (shift count) is fetched from the stack
IFDOS(`	mov	56(%rsp), %r8d	')

	push	%r12
	push	%r13
	push	%r14
	push	%r15
	push	%rbx

	mov	n, %rax
	xor	R32(%rbx), R32(%rbx)	C clear carry save register
	mov	R32(%r8), R32(%rcx)	C shift count
	xor	R32(%r15), R32(%r15)	C limb carry

	mov	R32(%rax), R32(%r11)
	and	$3, R32(%r11)		C r11 = n mod 4 limbs for the feed-in loop
	je	L(4)
	sub	$1, R32(%r11)		C bias so the loop exits when sub borrows

C Feed-in loop: one limb per iteration, runs n mod 4 times.
L(oopette):
	add	R32(%rbx), R32(%rbx)	C restore carry flag
	mov	0(up), %r8
	lea	8(up), up
	sbb	0(vp), %r8
	mov	%r8, %r12		C keep unshifted difference limb
	sbb	R32(%rbx), R32(%rbx)	C save carry flag
	shl	R8(%rcx), %r8
	or	%r15, %r8		C merge bits carried from previous limb
	mov	%r12, %r15
	lea	8(vp), vp
	neg	R8(%rcx)		C cl = -cnt for the right shift
	shr	R8(%rcx), %r15		C bits shifted out, used by next limb
	neg	R8(%rcx)		C back to cnt
	mov	%r8, 0(rp)
	lea	8(rp), rp
	sub	$1, R32(%r11)
	jnc	L(oopette)

L(4):
	sub	$4, %rax		C rax still holds the full n here
	jc	L(end)			C no complete group of 4 limbs

C Main loop: four limbs per iteration.
	ALIGN(16)
L(oop):
	add	R32(%rbx), R32(%rbx)	C restore carry flag

	mov	0(up), %r8
	mov	8(up), %r9
	mov	16(up), %r10
	mov	24(up), %r11

	lea	32(up), up

	sbb	0(vp), %r8
	mov	%r8, %r12
	sbb	8(vp), %r9
	mov	%r9, %r13
	sbb	16(vp), %r10
	mov	%r10, %r14
	sbb	24(vp), %r11

	sbb	R32(%rbx), R32(%rbx)	C save carry flag

	shl	R8(%rcx), %r8
	shl	R8(%rcx), %r9
	shl	R8(%rcx), %r10
	or	%r15, %r8
	mov	%r11, %r15
	shl	R8(%rcx), %r11

	lea	32(vp), vp

	neg	R8(%rcx)

	shr	R8(%rcx), %r12
	shr	R8(%rcx), %r13
	shr	R8(%rcx), %r14
	shr	R8(%rcx), %r15		C used next loop

	or	%r12, %r9
	or	%r13, %r10
	or	%r14, %r11

	neg	R8(%rcx)

	mov	%r8, 0(rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)

	lea	32(rp), rp

	sub	$4, %rax
	jnc	L(oop)
L(end):
	neg	R32(%rbx)		C rbx = 1 if a borrow is pending, else 0
	shl	R8(%rcx), %rbx		C scale the borrow by 2^cnt
C The adc also adds CF as left by the shl above; for 1 <= cnt <= 63 that is
C the last bit shifted out of rbx, which is 0.
	adc	%r15, %rbx		C add bits shifted out of the top limb
	mov	%rbx, %rax		C return value
	pop	%rbx
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12

	FUNC_EXIT()
	ret
EPILOGUE()