dnl  AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1)

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C This is an attempt at a sublsh1_n for x86-32, not relying on sse2 insns.  The
C innerloop is 2*3-way unrolled, which is best we can do with the available
C registers.  It seems tricky to use the same structure for rsblsh1_n, since we
C cannot feed carry between operations there.
C
C NOTE(review): the main loop in this file handles two groups of four limbs
C per iteration, which matches the 2*4-way description further down rather
C than the 2*3-way figure above -- confirm which statement is current.

C			    cycles/limb
C P5
C P6 model 0-8,10-12
C P6 model 9  (Banias)
C P6 model 13 (Dothan)
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C Intel Atom			 6.75
C AMD K6
C AMD K7
C AMD K8

C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
C processors.  It uses 2*4-way unrolling, for good reasons.
C
C Breaking carry recurrency might be a good idea.  We would then need separate
C registers for the shift carry and add/subtract carry, which in turn would
C force us to 2*2-way unrolling.

C ---------------------------------------------------------------------------
C mpn_sublsh1_n_ip1 -- in-place rp[] = rp[] - 2*up[] over size limbs.
C
C Stack arguments, per the defframe lines below:
C   offset  4  destination pointer, read and written through rp = edi
C   offset  8  source pointer, read through up = esi
C   offset 12  limb count
C
C Return value in eax: the final subtract borrow plus the bit shifted out of
C the top source limb, so 0, 1 or 2.
C
C Register roles:
C   eax  main-loop iteration counter on entry, then limb scratch; result
C   ebx  size mod 8 for the tail loop, then limb scratch
C   ecx  limb scratch
C   ebp  limb scratch, main loop only
C   edx  parking place for the carry bit that is not currently in CF
C   esi, edi are pushed and popped; ebx and ebp are saved in the argument
C   slots once those arguments have been loaded.
C
C Carry handling.  Two one-bit carries are live between limbs: the bit shifted
C out by the doubling, and the borrow of the subtraction.  Only one of them fits
C in CF, the other is kept in edx and swapped in with rcr / adc edx,edx.  Both
C bits have the weight of the next limb and both end up subtracted from it, so
C the code does not care which of them feeds the next doubling and which feeds
C the next sbb.
C
C With CPU_P6 defined both bits are stored in edx at the bottom of each loop
C and CF is reloaded with shr at the top, so nothing depends on CF surviving
C the dec/inc and the branch.  Presumably this is a P6-specific flags
C optimisation; the reason is not stated in this file.
C ---------------------------------------------------------------------------

defframe(PARAM_SIZE,	12)
defframe(PARAM_SRC,	 8)
defframe(PARAM_DST,	 4)

dnl  re-use parameter space
define(VAR_COUNT,`PARAM_SIZE')
define(SAVE_EBX,`PARAM_SRC')
define(SAVE_EBP,`PARAM_DST')

ASM_START()
	TEXT
	ALIGN(8)
PROLOGUE(mpn_sublsh1_n_ip1)
deflit(`FRAME',0)

define(`rp',	`%edi')
define(`up',	`%esi')

	mov	PARAM_SIZE, %eax	C size
	push	up			FRAME_pushl()
	push	rp			FRAME_pushl()
	xor	%edx, %edx		C no carry bit parked yet
	mov	PARAM_SRC, up
	mov	PARAM_DST, rp
	mov	%ebx, SAVE_EBX		C src slot is free now that up is loaded
	mov	%eax, %ebx
	shr	$3, %eax

	not	%eax			C count = -(size\8)-1
	and	$7, %ebx		C size % 8, also leaves CF clear
	jz	L(exact)

C Tail loop: one limb per iteration, size % 8 iterations.
L(oop):
ifdef(`CPU_P6',`
	shr	%edx ')			C restore 2nd saved carry bit
	mov	(up), %ecx
	adc	%ecx, %ecx		C double the limb, carry bit in and out via CF
	rcr	%edx			C restore 1st saved carry bit
	lea	4(up), up		C lea advances the pointer without touching CF
	sbb	%ecx, (rp)
	lea	4(rp), rp
	adc	%edx, %edx		C save a carry bit in edx
ifdef(`CPU_P6',`
	adc	%edx, %edx ')		C save another carry bit in edx
	dec	%ebx			C dec leaves CF unchanged
	jnz	L(oop)
L(exact):
	inc	%eax			C count becomes -(size\8), zero means no full block
	jz	L(end)
	mov	%eax, VAR_COUNT		C counter lives in memory, eax is needed for limbs
	mov	%ebp, SAVE_EBP		C dst slot is free now that rp is loaded

C Main loop: eight limbs per iteration as two groups of four.
	ALIGN(16)
L(top):
ifdef(`CPU_P6',`
	shr	%edx ')			C restore 2nd saved carry bit
	mov	(up), %eax
	adc	%eax, %eax
	mov	4(up), %ebx
	adc	%ebx, %ebx
	mov	8(up), %ecx
	adc	%ecx, %ecx
	mov	12(up), %ebp
	adc	%ebp, %ebp

	rcr	%edx			C restore 1st saved carry bit

	sbb	%eax, (rp)
	sbb	%ebx, 4(rp)
	sbb	%ecx, 8(rp)
	sbb	%ebp, 12(rp)

C CF now holds the borrow of limbs 0-3 and goes straight into the doubling of
C limbs 4-7; the bit shifted out of limbs 0-3 sits in bit 31 of edx and is
C handed to the second sbb group by the adc edx,edx below.
	mov	16(up), %eax
	adc	%eax, %eax
	mov	20(up), %ebx
	adc	%ebx, %ebx
	mov	24(up), %ecx
	adc	%ecx, %ecx
	mov	28(up), %ebp
	adc	%ebp, %ebp

	lea	32(up), up
	adc	%edx, %edx		C save a carry bit in edx

	sbb	%eax, 16(rp)
	sbb	%ebx, 20(rp)
	sbb	%ecx, 24(rp)
	sbb	%ebp, 28(rp)

ifdef(`CPU_P6',`
	adc	%edx, %edx ')		C save another carry bit in edx
	incl	VAR_COUNT		C sets ZF for the jne, leaves CF unchanged
	lea	32(rp), rp
	jne	L(top)

	mov	SAVE_EBP, %ebp
L(end):
	mov	SAVE_EBX, %ebx

C Return the sum of the two pending carry bits.  mov does not alter CF, so in
C the non-P6 variant the bit left in CF by the last loop is still valid here.
ifdef(`CPU_P6',`
	xor	%eax, %eax
	shr	$1, %edx
	adc	%edx, %eax
',`
	adc	$0, %edx
	mov	%edx, %eax
')
	pop	rp			FRAME_popl()
	pop	up			FRAME_popl()
	ret
EPILOGUE()
ASM_END()