dnl  Listing taken from:
dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/mod_34lsub1.asm

dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.

dnl  Copyright 2000-2002, 2004, 2005, 2007, 2009-2012 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 0.67	   0.583 is possible with zero-reg instead of $0, 4-way
C AMD K10	 0.67	   this seems hard to beat
C AMD bd1	 1
C AMD bobcat	 1.07
C Intel P4	 7.35	   terrible, use old code
C Intel core2	 1.25	   1+epsilon with huge unrolling
C Intel NHM	 1.15	   this seems hard to beat
C Intel SBR	 0.93
C Intel atom	 2.5
C VIA nano	 1.25	   this seems hard to beat

C INPUT PARAMETERS
C   ap	source pointer, the up argument of the prototype below
C   n	limb count
define(`ap',	%rdi)
define(`n',	%rsi)

C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)

C TODO
C  * Review feed-in and wind-down code.
C mpn_mod_34lsub1 -- fold the limbs at ap into a single limb that is
C congruent to the whole operand modulo 2^48-1.  No final reduction is done
C here, the returned value is only a folded sum.
C
C The first limb is loaded before the count is examined, so the count is
C assumed to be at least 1 -- TODO confirm against callers.
C
C Since 2^48 == 1 (mod 2^48-1), a 64-bit limb at index i has weight
C 2^(16*(i mod 3)).  Limbs are therefore summed into three accumulators
C according to i mod 3, and the accumulators are combined at the end.
C
C Register roles:
C   %rdi (ap)	pointer to the next limb, reused as scratch in the final fold
C   %rsi	limb count, then the biased remaining count, scratch when n == 2
C   %rax	sum of limbs 0,3,6,...  weight 1,    also the return value
C   %rcx	sum of limbs 1,4,7,...  weight 2^16
C   %rdx	sum of limbs 2,5,8,...  weight 2^32
C   %r9		number of carries out of %rdx, weight 2^192 == 1
C   %r11	the constant 2^48-1 used as a mask
C   %r8, %r10	jump table address and entry, %r10 also scratch in the fold
C
C FUNC_ENTRY, FUNC_EXIT, JMPENT and friends come from config.m4, which is not
C part of this file -- presumably they cover the DOS64/STD64 differences.

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mod_34lsub1)
	FUNC_ENTRY(2)

	mov	$0x0000FFFFFFFFFFFF, %r11	C 2^48-1, mask for the low 48 bits

	mov	(ap), %rax		C src[0], read before the count is tested

	cmp	$2, %rsi
	ja	L(gt2)			C more than 2 limbs: general code

	jb	L(one)			C fewer than 2 limbs: return src[0] as is

	C Exactly two limbs.
	mov	8(ap), %rsi
	mov	%rax, %rdx
	shr	$48, %rax		C src[0] high, bits 48-63

	and	%r11, %rdx		C src[0] low, bits 0-47
	add	%rdx, %rax
	mov	R32(%rsi), R32(%rdx)

	shr	$32, %rsi		C src[1] high
	add	%rsi, %rax

	shl	$16, %rdx		C src[1] low
	add	%rdx, %rax
L(one):	FUNC_EXIT()
	ret


C Don't change this, the wind-down code is not able to handle greater values
define(UNROLL,3)

C General case.  The first three limbs seed the accumulators.  The count is
C then biased by UNROLL*3+3 so that the borrow from the subtraction tells
C whether a full unrolled pass of UNROLL*3 limbs is still available.
L(gt2):	mov	8(ap), %rcx
	mov	16(ap), %rdx
	xor	%r9, %r9
	add	$24, ap
	sub	$eval(UNROLL*3+3), %rsi
	jc	L(end)
	ALIGN(16)
L(top):
	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
forloop(i,1,UNROLL-1,`dnl
	add	eval(i*24)(ap), %rax
	adc	eval(i*24+8)(ap), %rcx
	adc	eval(i*24+16)(ap), %rdx
	adc	$0, %r9
')dnl
	add	$eval(UNROLL*24), ap
	sub	$eval(UNROLL*3), %rsi
	jnc	L(top)

C Wind-down.  Here the biased count is in the range -9..-1 and count+9 is
C the number of limbs still to be added.  The displacements 36 = 9*4 and
C 72 = 9*8 rebase that index to 0..8.  The PIC path loads a sign-extended
C 4-byte entry and adds it to the table address, the other path jumps through
C an 8-byte entry.
L(end):
	lea	L(tab)(%rip), %r8
ifdef(`PIC',
`	movslq	36(%r8,%rsi,4), %r10
	add	%r10, %r8
	jmp	*%r8
',`
	jmp	*72(%r8,%rsi,8)
')
	JUMPTABSECT
	ALIGN(8)
L(tab):	JMPENT(	L(0), L(tab))
	JMPENT(	L(1), L(tab))
	JMPENT(	L(2), L(tab))
	JMPENT(	L(3), L(tab))
	JMPENT(	L(4), L(tab))
	JMPENT(	L(5), L(tab))
	JMPENT(	L(6), L(tab))
	JMPENT(	L(7), L(tab))
	JMPENT(	L(8), L(tab))
	TEXT

C Each entry point below adds the stated number of remaining limbs, falling
C through in steps of three.  The carry flag is kept live into cj2/cj1.

C 6 and 3 limbs left.
L(6):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(3):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	jmp	L(cj1)

C 7, 4 and 1 limbs left.
L(7):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(4):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(1):	add	(ap), %rax
	adc	$0, %rcx
	jmp	L(cj2)

C 8, 5 and 2 limbs left.
L(8):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(5):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(2):	add	(ap), %rax
	adc	8(ap), %rcx

C Propagate the pending carry through the accumulators it has not reached
C yet, then add the collected carry count back in at weight 1.  The last
C adc wraps a carry out of the top accumulator around to the bottom one.
L(cj2):	adc	$0, %rdx
L(cj1):	adc	$0, %r9
L(0):	add	%r9, %rax
	adc	$0, %rcx
	adc	$0, %rdx
	adc	$0, %rax

C Combine the three accumulators.  Each one is split at the bit position
C where its weight crosses 2^48: the part below is shifted up by 0, 16 or 32
C bits, the part above wraps around to weight 1.
	mov	%rax, %rdi		C 0mod3
	shr	$48, %rax		C 0mod3 high

	and	%r11, %rdi		C 0mod3 low
	mov	R32(%rcx), R32(%r10)	C 1mod3

	shr	$32, %rcx		C 1mod3 high

	add	%rdi, %rax		C apply 0mod3 low
	movzwl	%dx, R32(%rdi)		C 2mod3
	shl	$16, %r10		C 1mod3 low

	add	%rcx, %rax		C apply 1mod3 high
	shr	$16, %rdx		C 2mod3 high

	add	%r10, %rax		C apply 1mod3 low
	shl	$32, %rdi		C 2mod3 low

	add	%rdx, %rax		C apply 2mod3 high
	add	%rdi, %rax		C apply 2mod3 low

	FUNC_EXIT()
	ret
EPILOGUE()