github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/submul_1.asm (about) 1 dnl Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and 2 dnl subtract the result from a second limb vector. 3 4 dnl Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc. 5 6 dnl This file is part of the GNU MP Library. 7 dnl 8 dnl The GNU MP Library is free software; you can redistribute it and/or modify 9 dnl it under the terms of either: 10 dnl 11 dnl * the GNU Lesser General Public License as published by the Free 12 dnl Software Foundation; either version 3 of the License, or (at your 13 dnl option) any later version. 14 dnl 15 dnl or 16 dnl 17 dnl * the GNU General Public License as published by the Free Software 18 dnl Foundation; either version 2 of the License, or (at your option) any 19 dnl later version. 20 dnl 21 dnl or both in parallel, as here. 22 dnl 23 dnl The GNU MP Library is distributed in the hope that it will be useful, but 24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 26 dnl for more details. 27 dnl 28 dnl You should have received copies of the GNU General Public License and the 29 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 30 dnl see https://www.gnu.org/licenses/. 31 32 include(`../config.m4') 33 34 35 C cycles/limb 36 C P6 model 0-8,10-12 - 37 C P6 model 9 (Banias) 6.8 38 C P6 model 13 (Dothan) 6.9 39 C P4 model 0-1 (Willamette) ? 40 C P4 model 2 (Northwood) 5.87 41 C P4 model 3-4 (Prescott) 6.5 42 43 C This code represents a step forwards compared to the code available before 44 C GMP 5.1, but it is not carefully tuned for either P6 or P4. In fact, it is 45 C not good for P6. For P4 it saved a bit over 1 c/l for both Northwood and 46 C Prescott compared to the old code. 
C
C The arrangements made here to get a two-instruction dependent chain are
C slightly subtle.  In the loop the carry (or borrow rather) is kept as a
C negative so that a paddq can be used to give a low limb ready to store, and
C a high limb ready to become the new carry after a psrlq.
C
C If the carry was a simple two's complement negative then the psrlq shift
C would need to bring in 0 bits or 1 bits according to whether the high was
C zero or non-zero, since a non-zero value would represent a negative needing
C sign extension.  That wouldn't be particularly easy to arrange and certainly
C would add an instruction to the dependent chain, so instead an offset is
C applied so that the high limb will be 0xFFFFFFFF+c.  With c in the range
C -0xFFFFFFFF to 0, the value 0xFFFFFFFF+c is in the range 0 to 0xFFFFFFFF and
C is therefore always positive and can always have 0 bits shifted in, which is
C what psrlq does.
C
C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be
C done off the dependent chain.  The total adjustment then is to add
C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF
C to remove the offset from the current carry, for a net add of
C 0xFFFFFFFE00000001.  In the code this is applied to the destination limb
C when fetched.
C
C It's also possible to view the 0xFFFFFFFF adjustment as a ones' complement
C negative, which is how it's undone for the return value, but that doesn't
C seem as clear.

C Argument slots, listed by ascending frame offset.
defframe(PARAM_DST,        4)
defframe(PARAM_SRC,        8)
defframe(PARAM_SIZE,      12)
defframe(PARAM_MULTIPLIER,16)
defframe(PARAM_CARRY,     20)

	TEXT
	ALIGN(16)

C mpn_submul_1c
C
C Variant entry point: identical to mpn_submul_1 except that the initial
C borrow is read from PARAM_CARRY instead of starting at zero.  It only loads
C that value into mm1 and then joins the shared body at L(entry).

PROLOGUE(mpn_submul_1c)
deflit(`FRAME',0)
	movd	PARAM_CARRY, %mm1	C mm1 = caller-supplied initial borrow
	jmp	L(entry)
EPILOGUE()

C mpn_submul_1
C
C Multiply the PARAM_SIZE limbs at PARAM_SRC by PARAM_MULTIPLIER, subtract the
C products from the limbs at PARAM_DST, storing the differences back to
C PARAM_DST, and return the final borrow in eax.
C
C The first source and destination limbs are fetched before the size is
C examined, so at least one limb is always processed.
C
C Register roles in the shared body:
C   eax       source pointer
C   edx       destination pointer
C   ecx       limbs still to be counted off
C   mm7       multiplier
C   mm6       0xFFFFFFFE00000001, the bias added to each destination limb
C   mm0       running borrow, kept in the biased form 0xFFFFFFFF - borrow
C   mm1, mm3  source limbs, replaced by their 64-bit products
C   mm2, mm4  destination limbs, replaced by biased limb minus product
C
C The loop is unrolled two limbs per pass and software pipelined: the loads
C for a limb are issued one half-pass ahead of the arithmetic that consumes
C them.  The two tails finish whatever is still in flight.

PROLOGUE(mpn_submul_1)
deflit(`FRAME',0)
	pxor	%mm1, %mm1		C mm1 = 0, no incoming borrow

L(entry):
	mov	PARAM_SRC, %eax
	pcmpeqd	%mm0, %mm0		C mm0 = all ones

	movd	PARAM_MULTIPLIER, %mm7
	pcmpeqd	%mm6, %mm6		C mm6 = all ones

	mov	PARAM_DST, %edx
	psrlq	$32, %mm0		C mm0 = 0x00000000FFFFFFFF

	mov	PARAM_SIZE, %ecx
	psllq	$32, %mm6		C mm6 = 0xFFFFFFFF00000000

	psubq	%mm0, %mm6		C mm6 = 0xFFFFFFFE00000001

	psubq	%mm1, %mm0		C mm0 = 0xFFFFFFFF - initial borrow


	movd	(%eax), %mm3		C src limb 0
	movd	(%edx), %mm4		C dst limb 0

	add	$-1, %ecx		C ZF set when size was exactly 1
	paddq	%mm6, %mm4		C bias dst limb 0
	pmuludq	%mm7, %mm3		C product 0 (MMX ops leave ZF alone)
	jnz	L(multi)

	C Single limb: finish it and return.
	psubq	%mm3, %mm4		C biased dst - product
	paddq	%mm4, %mm0		C fold in the borrow
	movd	%mm0, (%edx)		C low half is the result limb
	jmp	L(done)

L(multi):
	movd	4(%eax), %mm1		C src limb 1
	movd	4(%edx), %mm2		C dst limb 1

	add	$-1, %ecx
	jz	L(tail_even)		C exactly two limbs in total

	ALIGN(16)
L(loop):
	C First half: finish the limb at offset 0, start the one at offset 4,
	C and fetch the limb at offset 8.
	paddq	%mm6, %mm2		C bias dst limb at 4
	pmuludq	%mm7, %mm1		C product for limb at 4
	psubq	%mm3, %mm4		C limb at 0: biased dst - product
	movd	8(%eax), %mm3		C next src limb
	paddq	%mm4, %mm0		C fold in the borrow
	movd	8(%edx), %mm4		C next dst limb
	movd	%mm0, (%edx)		C store result limb at 0
	psrlq	$32, %mm0		C high half is the next biased borrow

	add	$-1, %ecx
	jz	L(tail_odd)		C limbs at 4 and 8 remain in flight

	C Second half: finish the limb at offset 4, start the one at offset 8,
	C and fetch the limb at offset 12.
	paddq	%mm6, %mm4		C bias dst limb at 8
	pmuludq	%mm7, %mm3		C product for limb at 8
	psubq	%mm1, %mm2		C limb at 4: biased dst - product
	movd	12(%eax), %mm1		C next src limb
	paddq	%mm2, %mm0		C fold in the borrow
	movd	12(%edx), %mm2		C next dst limb
	movd	%mm0, 4(%edx)		C store result limb at 4
	psrlq	$32, %mm0		C high half is the next biased borrow

	lea	8(%eax), %eax		C advance both pointers by two limbs
	lea	8(%edx), %edx		C (lea keeps the flags untouched)
	add	$-1, %ecx
	jnz	L(loop)


L(tail_even):
	C Two limbs in flight, at offsets 0 and 4.  The one at 0 is already
	C biased and multiplied, the one at 4 still needs both steps.
	paddq	%mm6, %mm2		C bias dst limb at 4
	pmuludq	%mm7, %mm1		C product for limb at 4
	psubq	%mm3, %mm4		C limb at 0: biased dst - product
	paddq	%mm4, %mm0		C fold in the borrow
	movd	%mm0, (%edx)		C store result limb at 0
	psrlq	$32, %mm0		C next biased borrow
	psubq	%mm1, %mm2		C limb at 4: biased dst - product
	paddq	%mm2, %mm0		C fold in the borrow
	movd	%mm0, 4(%edx)		C store result limb at 4

L(done):
	C Recover the true borrow from its biased form and return it.
	psrlq	$32, %mm0		C high half = 0xFFFFFFFF - borrow
	movd	%mm0, %eax
	not	%eax			C eax = borrow
	emms				C leave MMX state clean for the caller
	ret

L(tail_odd):
	C Left from the middle of the loop with the pointers not yet advanced.
	C Two limbs in flight, at offsets 4 and 8.  The one at 4 is already
	C biased and multiplied, the one at 8 still needs both steps.
	paddq	%mm6, %mm4		C bias dst limb at 8
	pmuludq	%mm7, %mm3		C product for limb at 8
	psubq	%mm1, %mm2		C limb at 4: biased dst - product
	paddq	%mm2, %mm0		C fold in the borrow
	movd	%mm0, 4(%edx)		C store result limb at 4
	psrlq	$32, %mm0		C next biased borrow
	psubq	%mm3, %mm4		C limb at 8: biased dst - product
	paddq	%mm4, %mm0		C fold in the borrow
	movd	%mm0, 8(%edx)		C store result limb at 8
	jmp	L(done)
EPILOGUE()