github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/addlsh1_n.asm

dnl  AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns.
C The inner loop is 2*3-way unrolled, which is the best we can do with the
C available registers.  It seems tricky to use the same structure for
C rsblsh1_n, since we cannot feed carry between operations there.

C				cycles/limb
C P5
C P6 model 0-8,10-12
C P6 model 9   (Banias)
C P6 model 13  (Dothan)		 5.4  (worse than add_n + lshift)
C P4 model 0   (Willamette)
C P4 model 1   (?)
C P4 model 2   (Northwood)
C P4 model 3   (Prescott)
C P4 model 4   (Nocona)
C Intel Atom			 6
C AMD K6			 ?
C AMD K7			 2.5
C AMD K8

C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
C processors.  It uses 2*3-way unrolling, for good reasons.  Unfortunately,
C that means we need an initial magic multiply.
C
C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern.  We
C cannot do rsblsh1_n, since we feed carry from the shift blocks to the
C add/subtract blocks, which is right for addition but reversed for
C subtraction.  We could perhaps do sublsh1_n, with some extra move insns and
C without losing any time, since we are limited by carry-recurrence latency
C rather than by issue width.
C
C Breaking the carry recurrence might be a good idea.  We would then need
C separate registers for the shift carry and the add/subtract carry, which in
C turn would force us to 2*2-way unrolling.
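
C For reference, here is a rough C equivalent of the operation this file
C implements (a sketch for reading purposes only, not part of GMP: the name
C ref_addlsh1_n is made up, and 32-bit limbs are assumed, as on x86-32; the
C real code below instead juggles both carry chains in the flags and %edx):
C
C	mp_limb_t
C	ref_addlsh1_n (mp_limb_t *rp, const mp_limb_t *up,
C		       const mp_limb_t *vp, mp_size_t n)
C	{
C	  mp_limb_t shift_cy = 0, add_cy = 0;
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    {
C	      mp_limb_t s, r, c;
C	      s = (vp[i] << 1) | shift_cy;	/* double vp[i], with carry-in */
C	      shift_cy = vp[i] >> 31;		/* bit shifted out at the top */
C	      r = up[i] + s;
C	      c = r < s;			/* carry from the add */
C	      r += add_cy;
C	      c += r < add_cy;
C	      rp[i] = r;
C	      add_cy = c;
C	    }
C	  return shift_cy + add_cy;		/* carry-out limb: 0, 1 or 2 */
C	}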
defframe(PARAM_SIZE, 16)
defframe(PARAM_DBLD, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

dnl  re-use parameter space
define(VAR_COUNT, `PARAM_DST')
define(VAR_TMP,   `PARAM_DBLD')

ASM_START()
	TEXT
	ALIGN(8)
PROLOGUE(mpn_addlsh1_n)
deflit(`FRAME',0)

define(`rp', `%edi')
define(`up', `%esi')
define(`vp', `%ebp')

	mov	$0x2aaaaaab, %eax	C magic multiplier: (2^32+2)/6

	push	%ebx			FRAME_pushl()
	mov	PARAM_SIZE, %ebx	C size

	push	rp			FRAME_pushl()
	mov	PARAM_DST, rp

	mul	%ebx			C high half %edx = size\6

	push	up			FRAME_pushl()
	mov	PARAM_SRC, up

	not	%edx			C count = -(size\6)-1
	mov	%edx, VAR_COUNT

	push	vp			FRAME_pushl()
	mov	PARAM_DBLD, vp

	lea	3(%edx,%edx,2), %ecx	C count*3+3 = -(size\6)*3
	xor	%edx, %edx
	lea	(%ebx,%ecx,2), %ebx	C size + (count*3+3)*2 = size % 6
	or	%ebx, %ebx
	jz	L(exact)

C One limb per iteration, handling the size % 6 leftover limbs
L(oop):
ifdef(`CPU_P6',`
	shr	%edx ')			C restore 2nd saved carry bit
	mov	(vp), %eax
	adc	%eax, %eax
	rcr	%edx			C restore 1st saved carry bit
	lea	4(vp), vp
	adc	(up), %eax
	lea	4(up), up
	adc	%edx, %edx		C save a carry bit in edx
ifdef(`CPU_P6',`
	adc	%edx, %edx ')		C save another carry bit in edx
	dec	%ebx
	mov	%eax, (rp)
	lea	4(rp), rp
	jnz	L(oop)
	mov	vp, VAR_TMP
L(exact):
	incl	VAR_COUNT
	jz	L(end)

C Main loop: 6 limbs per iteration, as two 3-limb shift/add blocks
	ALIGN(16)
L(top):
ifdef(`CPU_P6',`
	shr	%edx ')			C restore 2nd saved carry bit
	mov	(vp), %eax
	adc	%eax, %eax
	mov	4(vp), %ebx
	adc	%ebx, %ebx
	mov	8(vp), %ecx
	adc	%ecx, %ecx

	rcr	%edx			C restore 1st saved carry bit

	adc	(up), %eax
	mov	%eax, (rp)
	adc	4(up), %ebx
	mov	%ebx, 4(rp)
	adc	8(up), %ecx
	mov	%ecx, 8(rp)

	mov	12(vp), %eax
	adc	%eax, %eax
	mov	16(vp), %ebx
	adc	%ebx, %ebx
	mov	20(vp), %ecx
	adc	%ecx, %ecx

	lea	24(vp), vp
	adc	%edx, %edx		C save a carry bit in edx

	adc	12(up), %eax
	mov	%eax, 12(rp)
	adc	16(up), %ebx
	mov	%ebx, 16(rp)
	adc	20(up), %ecx

	lea	24(up), up

ifdef(`CPU_P6',`
	adc	%edx, %edx ')		C save another carry bit in edx
	mov	%ecx, 20(rp)
	incl	VAR_COUNT
	lea	24(rp), rp
	jne	L(top)

L(end):
	pop	vp			FRAME_popl()
	pop	up			FRAME_popl()

C Combine the two outstanding carry bits into the return value
ifdef(`CPU_P6',`
	xor	%eax, %eax
	shr	$1, %edx
	adc	%edx, %eax
',`
	adc	$0, %edx
	mov	%edx, %eax
')
	pop	rp			FRAME_popl()
	pop	%ebx			FRAME_popl()
	ret
EPILOGUE()
ASM_END()
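
C For illustration, a sketch of a typical call from GMP-internal C code
C (mpn_addlsh1_n is declared in gmp-impl.h rather than the public gmp.h;
C the variable names here are made up):
C
C	mp_limb_t cy;
C	cy = mpn_addlsh1_n (rp, up, vp, n);	/* rp[] = up[] + (vp[] << 1) */
C	/* cy is the returned carry limb: 0, 1 or 2 */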