dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/mod_34lsub1.asm

dnl  AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1.

dnl  Copyright 2000-2002, 2004, 2005, 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C          cycles/limb
C Athlon:       1
C Hammer:       1


C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C The loop form below and the 64 byte code alignment seem necessary for the
C claimed speed.  This is a bit strange, since normally k7 isn't very
C sensitive to such things.  Perhaps there have to be 6 instructions in the
C first 16 bytes for the BTB entry or something.

C mpn_mod_34lsub1: fold the limbs at src into a single limb, modulo 2^24-1.
C
C Target:	32-bit x86, AT&T syntax, run through m4; arguments on the stack.
C In:		src	4(%esp)	pointer to limb 0, the limb of weight 2^0
C		size	8(%esp)	limb count; src[0] is read even when size < 1
C Out:		eax	a value congruent to the operand mod 2^24-1.  It is
C			not reduced below 2^24-1; for size 1 it is simply
C			src[0] unchanged.
C Clobbers:	ecx, edx, flags.  ebx, esi, edi are restored before return.
C Stack:	the long path pushes ebx and esi and parks edi in the size
C		argument slot, so that slot is overwritten.
C
C Method: 2^24 == 1 mod 2^24-1, hence limb i has weight
C 2^(32*i) == 2^(8*(i mod 3)).  Three accumulators collect the limbs with
C index 0, 1 and 2 mod 3.  They are chained with adcl, which is valid
C because the carry out of one accumulator has exactly the weight of the
C next one (out of the 2mod3 accumulator it is 2^48 == 1, wrapping back to
C the 0mod3 accumulator).  At the end each accumulator is split at its
C 24-bit boundary and the pieces are summed with their weights.
C
C NOTE(review): FRAME_pushl and FRAME_popl are defined outside this file;
C presumably they keep the defframe offsets in step with %esp -- confirm
C against the macro definitions.

defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC,  4)

dnl  re-use parameter space
define(SAVE_EDI, `PARAM_SIZE')

	TEXT
	ALIGN(64)
PROLOGUE(mpn_mod_34lsub1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %edx

	subl	$2, %ecx
	ja	L(three_or_more)	C size > 2

	movl	(%edx), %eax
	jb	L(one)			C size < 2, flags still from subl

	C exactly two limbs: src[0] has weight 1, src[1] has weight 2^8

	movl	4(%edx), %ecx
	movl	%eax, %edx
	shrl	$24, %eax		C src[0] high, bits 24-31

	andl	$0xFFFFFF, %edx		C src[0] low, bits 0-23
	addl	%edx, %eax
	movl	%ecx, %edx

	andl	$0xFFFF, %ecx
	shrl	$16, %edx		C src[1] high
	addl	%edx, %eax

	shll	$8, %ecx		C src[1] low
	addl	%ecx, %eax

L(one):
	ret


L(three_or_more):
	C eax
	C ebx
	C ecx	size-2
	C edx	src
	C esi
	C edi

	pushl	%ebx	FRAME_pushl()
	xorl	%eax, %eax
	xorl	%ebx, %ebx

	movl	%edi, SAVE_EDI		C edi parked in the size argument slot
	pushl	%esi	FRAME_pushl()
	xorl	%esi, %esi		C and clear carry flag


	C code offset 0x40 at this point
L(top):
	C eax	acc 0mod3
	C ebx	acc 1mod3
	C ecx	counter, limbs
	C edx	src
	C esi	acc 2mod3
	C edi
	C
	C Six limbs per full pass.  The carry flag is live across the whole
	C loop, so pointer and counter updates use leal and decl, neither of
	C which modifies it.

	leal	24(%edx), %edx
	leal	-2(%ecx), %ecx
	adcl	-24(%edx), %eax
	adcl	-20(%edx), %ebx
	adcl	-16(%edx), %esi

	decl	%ecx			C counter down by 3 in total
	jng	L(done_loop)		C at most 2 limbs left, at -12 and -8

	leal	-2(%ecx), %ecx
	adcl	-12(%edx), %eax
	adcl	-8(%edx), %ebx
	adcl	-4(%edx), %esi

	decl	%ecx
	jg	L(top)


	leal	12(%edx), %edx		C so leftovers sit at -12 and -8 again


L(done_loop):
	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
	C
	C edi is set to a mask whose negation-under-carry equals the weight
	C of the carry still pending: 1 after the 2mod3 accumulator, 2^8
	C after 0mod3, 2^16 after 1mod3.  movl, incl and decl leave the
	C carry flag alone.

	incl	%ecx
	movl	$0xFFFFFFFF, %edi
	js	L(combine)		C no limbs left

	adcl	-12(%edx), %eax
	decl	%ecx
	movl	$0xFFFFFF00, %edi
	js	L(combine)		C that was the last limb

	adcl	-8(%edx), %ebx
	movl	$0xFFFF0000, %edi


L(combine):
	C eax	acc 0mod3
	C ebx	acc 1mod3
	C ecx
	C edx
	C esi	acc 2mod3
	C edi	mask

	sbbl	%ecx, %ecx		C carry, 0 or -1
	movl	%eax, %edx		C 0mod3
	shrl	$24, %eax		C 0mod3 high

	andl	%edi, %ecx		C carry masked, 0 or minus its weight
	andl	$0x00FFFFFF, %edx	C 0mod3 low
	movl	%ebx, %edi		C 1mod3

	subl	%ecx, %eax		C apply carry
	shrl	$16, %ebx		C 1mod3 high
	andl	$0xFFFF, %edi

	addl	%edx, %eax		C apply 0mod3 low
	movl	%esi, %edx		C 2mod3
	shll	$8, %edi		C 1mod3 low

	addl	%ebx, %eax		C apply 1mod3 high
	shrl	$8, %esi		C 2mod3 high
	movzbl	%dl, %edx		C 2mod3 low

	addl	%edi, %eax		C apply 1mod3 low
	shll	$16, %edx		C 2mod3 low

	addl	%esi, %eax		C apply 2mod3 high
	popl	%esi	FRAME_popl()

	movl	SAVE_EDI, %edi
	addl	%edx, %eax		C apply 2mod3 low
	popl	%ebx	FRAME_popl()

	ret

EPILOGUE()