github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/mode1o.asm (about) 1 dnl AMD K7 mpn_modexact_1_odd -- exact division style remainder. 2 3 dnl Copyright 2000-2002, 2004, 2007 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C cycles/limb 35 C Athlon: 11.0 36 C Hammer: 7.0 37 38 39 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, 40 C mp_limb_t divisor); 41 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, 42 C mp_limb_t divisor, mp_limb_t carry); 43 C 44 C With the loop running at just 11 cycles it doesn't seem worth bothering to 45 C check for high<divisor to save one step. 46 C 47 C Using a divl for size==1 measures slower than the modexact method, which 48 C is not too surprising since for the latter it's only about 24 cycles to 49 C calculate the modular inverse. 50 51 defframe(PARAM_CARRY, 16) 52 defframe(PARAM_DIVISOR,12) 53 defframe(PARAM_SIZE, 8) 54 defframe(PARAM_SRC, 4) 55 56 defframe(SAVE_EBX, -4) 57 defframe(SAVE_ESI, -8) 58 defframe(SAVE_EDI, -12) 59 defframe(SAVE_EBP, -16) 60 61 deflit(STACK_SPACE, 16) 62 63 TEXT 64 65 ALIGN(16) 66 PROLOGUE(mpn_modexact_1c_odd) 67 deflit(`FRAME',0) 68 69 movl PARAM_CARRY, %ecx 70 jmp L(start_1c) 71 72 EPILOGUE() 73 74 75 ALIGN(16) 76 PROLOGUE(mpn_modexact_1_odd) 77 deflit(`FRAME',0) 78 79 xorl %ecx, %ecx 80 L(start_1c): 81 movl PARAM_DIVISOR, %eax 82 subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) 83 84 movl %esi, SAVE_ESI 85 movl PARAM_DIVISOR, %esi 86 87 movl %edi, SAVE_EDI 88 89 shrl %eax C d/2 90 91 andl $127, %eax 92 93 ifdef(`PIC',` 94 LEA( binvert_limb_table, %edi) 95 movzbl (%eax,%edi), %edi C inv 8 bits 96 ',` 97 movzbl binvert_limb_table(%eax), %edi C inv 8 bits 98 ') 99 100 xorl %edx, %edx C initial extra carry 101 leal (%edi,%edi), %eax C 2*inv 102 103 imull %edi, %edi C inv*inv 104 105 movl %ebp, SAVE_EBP 106 movl PARAM_SIZE, %ebp 107 108 movl %ebx, SAVE_EBX 109 movl PARAM_SRC, %ebx 110 111 imull %esi, %edi C inv*inv*d 112 113 subl %edi, %eax C inv = 2*inv - inv*inv*d 114 leal (%eax,%eax), %edi C 2*inv 115 116 imull %eax, %eax C inv*inv 117 118 imull %esi, %eax C inv*inv*d 119 120 leal (%ebx,%ebp,4), %ebx C src end 121 negl %ebp C -size 122 123 subl %eax, %edi C inv = 2*inv - inv*inv*d 124 125 ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS 126 movl %esi, %eax 127 imull %edi, %eax 128 cmpl $1, %eax') 129 130 131 C The dependent chain here is 132 C 133 C cycles 134 C subl %edx, %eax 1 135 C imull %edi, %eax 4 136 C mull %esi 6 (high limb) 137 C ---- 138 C total 11 139 C 140 C Out of order execution hides the load latency for the source data, so no 141 C special scheduling is required. 142 143 L(top): 144 C eax src limb 145 C ebx src end ptr 146 C ecx next carry bit, 0 or 1 (or initial carry param) 147 C edx carry limb, high of last product 148 C esi divisor 149 C edi inverse 150 C ebp counter, limbs, negative 151 152 movl (%ebx,%ebp,4), %eax 153 154 subl %ecx, %eax C apply carry bit 155 movl $0, %ecx 156 157 setc %cl C new carry bit 158 159 subl %edx, %eax C apply carry limb 160 adcl $0, %ecx 161 162 imull %edi, %eax 163 164 mull %esi 165 166 incl %ebp 167 jnz L(top) 168 169 170 movl SAVE_ESI, %esi 171 movl SAVE_EDI, %edi 172 leal (%ecx,%edx), %eax 173 174 movl SAVE_EBX, %ebx 175 movl SAVE_EBP, %ebp 176 addl $STACK_SPACE, %esp 177 178 ret 179 180 EPILOGUE() 181 ASM_END()