github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/pre_mod_1.asm (about) 1 dnl AMD K6 mpn_preinv_mod_1 -- mpn by 1 remainder, with pre-inverted divisor. 2 3 dnl Copyright 2000, 2002, 2003 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C K6: 18.0 cycles/limb 35 36 37 C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor, 38 C mp_limb_t inverse); 39 C 40 C This code is only 2 c/l faster than a simple divl, but that's 10% so it's 41 C considered worthwhile (just). 
C Stack arguments of mpn_preinv_mod_1, in prototype order: src, size,
C divisor, inverse.
C NOTE(review): defframe is defined outside this file; the numbers are
C presumably byte offsets from the stack pointer at entry, kept valid
C after each push by the FRAME bookkeeping below -- confirm.
defframe(PARAM_INVERSE,16)
defframe(PARAM_DIVISOR,12)
defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC, 4)

	TEXT
	ALIGN(32)
C mpn_preinv_mod_1: return the remainder of the size-limb number at src
C divided by divisor, using the caller-supplied inverse in place of a
C hardware divide.  Limbs are processed from the most significant one
C (src[size-1]) down to src[0].  The result is returned in eax.
C
C The ASSERTs below express the preconditions size >= 1 and divisor with
C bit 31 set.  NOTE(review): ASSERT is defined outside this file and is
C presumably only active in debugging builds -- confirm.
PROLOGUE(mpn_preinv_mod_1)
deflit(`FRAME',0)

	ASSERT(ae,`cmpl $1, PARAM_SIZE')
	ASSERT(nz,`testl $0x80000000, PARAM_DIVISOR')

	C Load the arguments, interleaved with saving ebp, edi, esi and ebx.
	C Each push is paired with FRAME_pushl() so that the PARAM_ operands
	C used afterwards still address the arguments.

	movl	PARAM_SIZE, %ecx
	pushl	%ebp		FRAME_pushl()

	movl	PARAM_SRC, %ebp
	pushl	%edi		FRAME_pushl()

	movl	PARAM_DIVISOR, %eax
	pushl	%esi		FRAME_pushl()

	movl	-4(%ebp,%ecx,4), %esi	C src high limb
	pushl	%ebx		FRAME_pushl()

	C edx has not been written by this function yet.  Copying it to edi
	C makes the next "sbbl %edx, %edi" compute edi-edx-carry = -carry,
	C whatever edx holds, so the first step sees a plain 0 or -1 derived
	C only from the borrow of the subl below.

	movl	%edx, %edi		C first n2 to cancel
	subl	%eax, %esi		C first n1 = high-divisor

	C decl leaves the carry flag alone, so the borrow from the subl above
	C is still available to the sbbl at L(top) or L(done_sbbl).

	decl	%ecx
	jz	L(done_sbbl)		C size was 1: only the high limb

L(top):
	C eax	scratch
	C ebx	n10, nadj, q1
	C ecx	counter, size to 1
	C edx	scratch
	C esi	n2
	C edi	old high, for underflow test
	C ebp	src
	C
	C On arrival the carry flag is the borrow from the most recent
	C "subl ..., %esi" (either the initial one or the one at the bottom
	C of the loop), and edx, edi still hold the two high words whose
	C difference, less that borrow, is 0 or -1.

	sbbl	%edx, %edi	    C high n-(q1+1)*d, 0 or -1

L(entry):
	C Turn the 0 or -1 mask into 0 or d, the amount to add back.
	andl	PARAM_DIVISOR, %edi
L(q1_ff_top):
	C Entered from L(q1_ff) with edi = d already set as the addback.
	movl	-4(%ebp,%ecx,4), %ebx	C next lower limb, src[ecx-1]

	addl	%esi, %edi	    C possible addback
	movl	%ebx, %esi	    C n10

	sarl	$31, %ebx	    C -n1 = 0 or -1
	movl	%edi, %eax	    C n2

	movl	PARAM_INVERSE, %edx
	subl	%ebx, %eax	    C n2+n1

	mull	%edx		    C m*(n2+n1)

	andl	PARAM_DIVISOR, %ebx C -n1 & d
	addl	%esi, %ebx	    C nadj = n10 + (-n1&d), ignoring overflow

	C The addl sets the carry flag consumed by the adcl; the leal in
	C between does not touch the flags.

	addl	%ebx, %eax	    C low m*(n2+n1) + nadj, giving carry flag
	leal	1(%edi), %ebx	    C n2+1

	adcl	%ebx, %edx	    C 1+high(n2<<32+m*(n2+n1)+nadj) = q1+1

	C movl does not alter the flags, so the jz tests the zero flag left
	C by the adcl, that is q1+1 having wrapped round to 0.

	movl	PARAM_DIVISOR, %eax C d
	jz	L(q1_ff)

	mull	%edx		    C (q1+1)*d

	C The borrow from this subl is carried to the sbbl at L(top) or at
	C L(done_sbbl); the loop instruction decrements ecx and branches
	C without changing the flags.

	subl	%eax, %esi	    C low  n-(q1+1)*d
	loop	L(top)



L(done_sbbl):
	C Final underflow test, same as at L(top) but with no limb left.
	sbbl	%edx, %edi	    C high n-(q1+1)*d, 0 or -1

	andl	PARAM_DIVISOR, %edi
L(done_esi_edi):
	C Return value is esi plus the addback in edi.  The registers are
	C restored in the reverse of the order they were pushed.
	popl	%ebx

	leal	(%esi,%edi), %eax
	popl	%esi

	popl	%edi
	popl	%ebp

	ret


C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
C of q*d is simply -d and the remainder n-q*d = n10+d.  This is rarely
C reached.
C
C esi still holds n10 here, so loading d into edi makes either the addl at
C L(q1_ff_top) or the leal at L(done_esi_edi) form n10+d.

L(q1_ff):
	movl	PARAM_DIVISOR, %edi
	loop	L(q1_ff_top)		C more limbs remain: continue

	jmp	L(done_esi_edi)		C that was the last limb: return


EPILOGUE()