dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
dnl  in a third limb vector.

dnl  Copyright 1996-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C     cycles/crossproduct
C P5      15
C P6       7.5
C K6      12.5
C K7       5.5
C P4      24


C void mpn_mul_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xsize,
C                        mp_srcptr yp, mp_size_t ysize);
C
C This was written in a haste since the Pentium optimized code that was used
C for all x86 machines was slow for the Pentium II.  This code would benefit
C from some cleanup.
C
C To shave off some percentage of the run-time, one should make 4 variants
C of the Louter loop, for the four different outcomes of un mod 4.  That
C would avoid Loop0 altogether.  Code expansion would be > 4-fold for that
C part of the function, but since it is not very large, that would be
C acceptable.
C
C The mul loop (at L(oopM)) might need some tweaking.  Its current speed is
C unknown.
C
C Target: 32-bit x86, AT&T syntax; all five arguments are read from the
C stack through the PARAM_* names declared below.
C
C Operand sizes are not validated.  xp[0] and yp[0] are read unconditionally,
C and the xsize = 1 exit never looks at ysize, so the caller presumably has
C to guarantee xsize >= ysize >= 1 -- confirm against the callers.
C
C Structure:
C   1. First row:   wp[0 .. xsize]    = xp[] * yp[0]   (plain store, L(oopM))
C   2. Later rows:  wp[i .. i+xsize-1] += xp[] * yp[i], i = 1 .. ysize-1,
C      and the carry-out limb is stored to wp[i+xsize].  Each row handles
C      xsize mod 4 limbs one at a time (L(oop0)) and the remaining limbs
C      four per iteration (L(oopX)).
C
C Register roles:
C   esi       running pointer into xp
C   edi       running pointer into wp
C   ebp       points at the current y limb for the first row and at the top
C             of each later row; inside L(oopX) it is reused as a second
C             carry limb, so L(outer) reloads it from PARAM_YP
C   ebx       carry limb between successive limb products
C   ecx       inner loop counter
C   eax, edx  low/high halves of each mull result
C
C esi, ebp, edi and ebx are saved on entry and restored on exit.

defframe(PARAM_YSIZE,20)
defframe(PARAM_YP,   16)
defframe(PARAM_XSIZE,12)
defframe(PARAM_XP,   8)
defframe(PARAM_WP,   4)

C Two local stack slots, reserved by the subl at function entry.
defframe(VAR_MULTIPLIER, -4)
defframe(VAR_COUNTER,    -8)
deflit(VAR_STACK_SPACE,  8)

        TEXT
        ALIGN(8)

PROLOGUE(mpn_mul_basecase)
deflit(`FRAME',0)

        subl    $VAR_STACK_SPACE,%esp   C room for VAR_MULTIPLIER, VAR_COUNTER
        pushl   %esi
        pushl   %ebp
        pushl   %edi
deflit(`FRAME',eval(VAR_STACK_SPACE+12))

        movl    PARAM_XP,%esi
        movl    PARAM_WP,%edi
        movl    PARAM_YP,%ebp

        movl    (%esi),%eax             C load xp[0]
        mull    (%ebp)                  C multiply by yp[0]
        movl    %eax,(%edi)             C store to wp[0]
        movl    PARAM_XSIZE,%ecx        C xsize
        decl    %ecx                    C If xsize = 1, ysize = 1 too
        jz      L(done)                 C ebx has not been pushed on this path

        pushl   %ebx
FRAME_pushl()
        movl    %edx,%ebx               C high limb of xp[0]*yp[0] is the carry

        leal    4(%esi),%esi            C xp += 1
        leal    4(%edi),%edi            C wp += 1

C First row: wp[j] = low (xp[j] * yp[0]) + carry, for j = 1 .. xsize-1.
L(oopM):
        movl    (%esi),%eax             C load next limb at xp[j]
        leal    4(%esi),%esi
        mull    (%ebp)
        addl    %ebx,%eax               C low limb + incoming carry
        movl    %edx,%ebx
        adcl    $0,%ebx                 C outgoing carry = high limb + CF
        movl    %eax,(%edi)
        leal    4(%edi),%edi
        decl    %ecx
        jnz     L(oopM)

        movl    %ebx,(%edi)             C most significant limb of product
        addl    $4,%edi                 C increment wp
        C Rewind by xsize limbs: esi is back at xp[0], edi ends up at wp[1].
        movl    PARAM_XSIZE,%eax
        shll    $2,%eax
        subl    %eax,%edi
        subl    %eax,%esi

        movl    PARAM_YSIZE,%eax        C ysize
        decl    %eax
        jz      L(skip)                 C ysize = 1, nothing left to add
        movl    %eax,VAR_COUNTER        C remaining rows = ysize - 1

C One pass per remaining y limb.
L(outer):
        movl    PARAM_YP,%ebp           C yp
        addl    $4,%ebp                 C make ebp point to next y limb
        movl    %ebp,PARAM_YP           C argument slot doubles as running yp
        movl    (%ebp),%eax             C copy y limb ...
        movl    %eax,VAR_MULTIPLIER     C ... to stack slot
        movl    PARAM_XSIZE,%ecx

        xorl    %ebx,%ebx               C carry limb starts at zero
        andl    $3,%ecx                 C xsize mod 4 limbs go through L(oop0)
        jz      L(end0)

C Single-limb multiply-accumulate: (edi) += xp[j] * multiplier + carry.
L(oop0):
        movl    (%esi),%eax
        mull    VAR_MULTIPLIER
        leal    4(%esi),%esi
        addl    %ebx,%eax               C low limb + incoming carry
        movl    $0,%ebx                 C movl, not xorl: CF must survive
        adcl    %ebx,%edx               C high limb += CF (ebx is zero)
        addl    %eax,(%edi)
        adcl    %edx,%ebx               C propagate carry into cylimb

        leal    4(%edi),%edi
        decl    %ecx
        jnz     L(oop0)

L(end0):
        movl    PARAM_XSIZE,%ecx
        shrl    $2,%ecx                 C number of 4-limb groups
        jz      L(endX)

C Four limbs per iteration.  ebx and ebp alternate as the carry limb; each
C movl $0 clears the next carry register without disturbing CF, which the
C following adcl consumes.
        ALIGN(8)
L(oopX):
        movl    (%esi),%eax
        mull    VAR_MULTIPLIER
        addl    %eax,%ebx               C incoming carry + low limb 0
        movl    $0,%ebp
        adcl    %edx,%ebp               C ebp = high limb 0 + CF

        movl    4(%esi),%eax
        mull    VAR_MULTIPLIER
        addl    %ebx,(%edi)             C accumulate into wp limb 0
        adcl    %eax,%ebp               C new lo + cylimb
        movl    $0,%ebx
        adcl    %edx,%ebx

        movl    8(%esi),%eax
        mull    VAR_MULTIPLIER
        addl    %ebp,4(%edi)            C accumulate into wp limb 1
        adcl    %eax,%ebx               C new lo + cylimb
        movl    $0,%ebp
        adcl    %edx,%ebp

        movl    12(%esi),%eax
        mull    VAR_MULTIPLIER
        addl    %ebx,8(%edi)            C accumulate into wp limb 2
        adcl    %eax,%ebp               C new lo + cylimb
        movl    $0,%ebx
        adcl    %edx,%ebx

        addl    %ebp,12(%edi)           C accumulate into wp limb 3
        adcl    $0,%ebx                 C propagate carry into cylimb

        leal    16(%esi),%esi
        leal    16(%edi),%edi
        decl    %ecx
        jnz     L(oopX)

L(endX):
        movl    %ebx,(%edi)             C top limb of this row is stored, not added
        addl    $4,%edi

        C we incremented wp and xp in the loop above; compensate
        C (esi returns to xp[0], edi ends one limb past this row's start)
        movl    PARAM_XSIZE,%eax
        shll    $2,%eax
        subl    %eax,%edi
        subl    %eax,%esi

        movl    VAR_COUNTER,%eax
        decl    %eax
        movl    %eax,VAR_COUNTER
        jnz     L(outer)

C Common exit once ebx has been pushed.
L(skip):
        popl    %ebx
        popl    %edi
        popl    %ebp
        popl    %esi
        addl    $VAR_STACK_SPACE,%esp   C release the locals reserved at entry
        ret

C xsize = 1 exit: taken before ebx was pushed, so only three registers are
C restored here.
L(done):
        movl    %edx,4(%edi)            C store to wp[1]
        popl    %edi
        popl    %ebp
        popl    %esi
        addl    $VAR_STACK_SPACE,%esp   C release the locals reserved at entry
        ret

EPILOGUE()