github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/mode1o.asm (about) 1 dnl AMD64 mpn_modexact_1_odd -- Hensel norm remainder. 2 3 dnl Copyright 2000-2006, 2011, 2012 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C cycles/limb 35 C AMD K8,K9 10 36 C AMD K10 10 37 C Intel P4 33 38 C Intel core2 13 39 C Intel corei 14.5 40 C Intel atom 35 41 C VIA nano ? 42 43 44 C The dependent chain in the main loop is 45 C 46 C cycles 47 C sub %rdx, %rax 1 48 C imul %r9, %rax 4 49 C mul %r8 5 50 C ---- 51 C total 10 52 C 53 C The mov load from src seems to need to be scheduled back before the jz to 54 C achieve this speed, out-of-order execution apparently can't completely hide 55 C the latency otherwise. 56 C 57 C The l=src[i]-cbit step is rotated back too, since that allows us to avoid it 58 C for the first iteration (where there's no cbit). 59 C 60 C The code alignment used (32-byte) for the loop also seems necessary. Without 61 C that the non-PIC case has adc crossing the 0x60 offset, apparently making it 62 C run at 11 cycles instead of 10. 63 64 65 ABI_SUPPORT(DOS64) 66 ABI_SUPPORT(STD64) 67 68 ASM_START() 69 TEXT 70 ALIGN(32) 71 PROLOGUE(mpn_modexact_1_odd) 72 FUNC_ENTRY(3) 73 mov $0, R32(%rcx) 74 IFDOS(` jmp L(ent) ') 75 76 PROLOGUE(mpn_modexact_1c_odd) 77 FUNC_ENTRY(4) 78 L(ent): 79 C rdi src 80 C rsi size 81 C rdx divisor 82 C rcx carry 83 84 mov %rdx, %r8 C d 85 shr R32(%rdx) C d/2 86 87 LEA( binvert_limb_table, %r9) 88 89 and $127, R32(%rdx) 90 mov %rcx, %r10 C initial carry 91 92 movzbl (%r9,%rdx), R32(%rdx) C inv 8 bits 93 94 mov (%rdi), %rax C src[0] 95 lea (%rdi,%rsi,8), %r11 C src end 96 mov %r8, %rdi C d, made available to imull 97 98 lea (%rdx,%rdx), R32(%rcx) C 2*inv 99 imul R32(%rdx), R32(%rdx) C inv*inv 100 101 neg %rsi C -size 102 103 imul R32(%rdi), R32(%rdx) C inv*inv*d 104 105 sub R32(%rdx), R32(%rcx) C inv = 2*inv - inv*inv*d, 16 bits 106 107 lea (%rcx,%rcx), R32(%rdx) C 2*inv 108 imul R32(%rcx), R32(%rcx) C inv*inv 109 110 imul R32(%rdi), R32(%rcx) C inv*inv*d 111 112 sub R32(%rcx), R32(%rdx) C inv = 2*inv - inv*inv*d, 32 bits 113 xor R32(%rcx), R32(%rcx) C initial cbit 114 115 lea (%rdx,%rdx), %r9 C 2*inv 116 imul %rdx, %rdx C inv*inv 117 118 imul %r8, %rdx C inv*inv*d 119 120 sub %rdx, %r9 C inv = 2*inv - inv*inv*d, 64 bits 121 mov %r10, %rdx C initial climb 122 123 ASSERT(e,` C d*inv == 1 mod 2^64 124 mov %r8, %r10 125 imul %r9, %r10 126 cmp $1, %r10') 127 128 inc %rsi 129 jz L(one) 130 131 132 ALIGN(16) 133 L(top): 134 C rax l = src[i]-cbit 135 C rcx new cbit, 0 or 1 136 C rdx climb, high of last product 137 C rsi counter, limbs, negative 138 C rdi 139 C r8 divisor 140 C r9 inverse 141 C r11 src end ptr 142 143 sub %rdx, %rax C l = src[i]-cbit - climb 144 145 adc $0, %rcx C more cbit 146 imul %r9, %rax C q = l * inverse 147 148 mul %r8 C climb = high (q * d) 149 150 mov (%r11,%rsi,8), %rax C src[i+1] 151 sub %rcx, %rax C next l = src[i+1] - cbit 152 setc R8(%rcx) C new cbit 153 154 inc %rsi 155 jnz L(top) 156 157 158 L(one): 159 sub %rdx, %rax C l = src[i]-cbit - climb 160 161 adc $0, %rcx C more cbit 162 imul %r9, %rax C q = l * inverse 163 164 mul %r8 C climb = high (q * d) 165 166 lea (%rcx,%rdx), %rax C climb+cbit 167 FUNC_EXIT() 168 ret 169 170 EPILOGUE(mpn_modexact_1c_odd) 171 EPILOGUE(mpn_modexact_1_odd)