github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/mode1o.asm (about) 1 dnl AMD K6 mpn_modexact_1_odd -- exact division style remainder. 2 3 dnl Copyright 2000-2003, 2007 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C K6: 10.0 cycles/limb 35 36 37 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, 38 C mp_limb_t divisor); 39 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, 40 C mp_limb_t divisor, mp_limb_t carry); 41 C 42 C A special case for high<divisor at the end measured only about 4 cycles 43 C faster, and so isn't used. 44 C 45 C A special case for size==1 using a divl rather than the inverse measured 46 C only about 5 cycles faster, and so isn't used. When size==1 and 47 C high<divisor it can skip a division and be a full 24 cycles faster, but 48 C this isn't an important case. 49 50 defframe(PARAM_CARRY, 16) 51 defframe(PARAM_DIVISOR,12) 52 defframe(PARAM_SIZE, 8) 53 defframe(PARAM_SRC, 4) 54 55 TEXT 56 57 ALIGN(32) 58 PROLOGUE(mpn_modexact_1c_odd) 59 deflit(`FRAME',0) 60 61 movl PARAM_DIVISOR, %ecx 62 pushl %esi FRAME_pushl() 63 64 movl PARAM_CARRY, %edx 65 jmp L(start_1c) 66 67 EPILOGUE() 68 69 70 ALIGN(16) 71 PROLOGUE(mpn_modexact_1_odd) 72 deflit(`FRAME',0) 73 74 movl PARAM_DIVISOR, %ecx 75 pushl %esi FRAME_pushl() 76 77 xorl %edx, %edx 78 L(start_1c): 79 pushl %edi FRAME_pushl() 80 81 shrl %ecx C d/2 82 movl PARAM_DIVISOR, %esi 83 84 andl $127, %ecx C d/2, 7 bits 85 pushl %ebp FRAME_pushl() 86 87 ifdef(`PIC',` 88 LEA( binvert_limb_table, %edi) 89 Zdisp( movzbl, 0,(%ecx,%edi), %edi) C inv 8 bits 90 ',` 91 movzbl binvert_limb_table(%ecx), %edi C inv 8 bits 92 ') 93 leal (%edi,%edi), %ecx C 2*inv 94 95 imull %edi, %edi C inv*inv 96 97 movl PARAM_SRC, %eax 98 movl PARAM_SIZE, %ebp 99 100 imull %esi, %edi C inv*inv*d 101 102 pushl %ebx FRAME_pushl() 103 leal (%eax,%ebp,4), %ebx C src end 104 105 subl %edi, %ecx C inv = 2*inv - inv*inv*d 106 leal (%ecx,%ecx), %edi C 2*inv 107 108 imull %ecx, %ecx C inv*inv 109 110 movl (%eax), %eax C src low limb 111 negl %ebp C -size 112 113 imull %esi, %ecx C inv*inv*d 114 115 subl %ecx, %edi C inv = 2*inv - inv*inv*d 116 117 ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS 118 pushl %eax 119 movl %esi, %eax 120 imull %edi, %eax 121 cmpl $1, %eax 122 popl %eax') 123 124 jmp L(entry) 125 126 127 C Rotating the mul to the top of the loop saves 1 cycle, presumably by 128 C hiding the loop control under the imul latency. 129 C 130 C The run time is 10 cycles, but decoding is only 9 (and the dependent chain 131 C only 8). It's not clear how to get down to 9 cycles. 132 C 133 C The xor and rcl to handle the carry bit could be an sbb instead, with the 134 C the carry bit add becoming a sub, but that doesn't save anything. 135 136 L(top): 137 C eax (low product) 138 C ebx src end 139 C ecx carry bit, 0 or 1 140 C edx (high product, being carry limb) 141 C esi divisor 142 C edi inverse 143 C ebp counter, limbs, negative 144 145 mull %esi 146 147 movl (%ebx,%ebp,4), %eax 148 addl %ecx, %edx C apply carry bit to carry limb 149 150 L(entry): 151 xorl %ecx, %ecx 152 subl %edx, %eax C apply carry limb 153 154 rcll %ecx 155 156 imull %edi, %eax 157 158 incl %ebp 159 jnz L(top) 160 161 162 163 popl %ebx 164 popl %ebp 165 166 mull %esi 167 168 popl %edi 169 popl %esi 170 171 leal (%ecx,%edx), %eax 172 173 ret 174 175 EPILOGUE() 176 ASM_END()