github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/p6/lshsub_n.asm (about) 1 dnl Intel P6 mpn_lshsub_n -- mpn papillion support. 2 3 dnl Copyright 2006 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 C P6/13: 3.35 cycles/limb (separate mpn_sub_n + mpn_lshift needs 4.12) 34 35 C (1) The loop is not scheduled in any way, and scheduling attempts have not 36 C improved speed on P6/13. Presumably, the K7 will want scheduling, if it 37 C at all wants to use MMX. 38 C (2) We could save a register by not alternatingly using eax and edx in the 39 C loop. 

C mp_limb_t mpn_lshsub_n (rp, up, vp, n, count)
C
C ABI: 32-bit x86, all arguments on the stack (cdecl-style), MMX required.
C
C Computes the limb-wise difference d[i] = up[i] - vp[i] - borrow for
C i = 0 .. n-1, shifts the resulting n-limb value left by count bits and
C stores the low n limbs at rp:
C
C	rp[i] = (d[i] << count) | (d[i-1] >> (32-count)),   with d[-1] = 0
C
C Return value (eax): (final borrow << count) | (d[n-1] >> (32-count)).
C
C Stack arguments after the three pushes below:
C	16(%esp) rp, 20(%esp) up, 24(%esp) vp, 28(%esp) n, 32(%esp) count
C
C Clobbers: eax, ecx, edx, mm0, mm1, mm7, flags.  edi, esi and ebx are saved
C and restored.  emms is executed before returning.
C
C Assumptions (not checked here, confirm against callers): n >= 1, and count
C is a normal limb shift count.  With n = 0 the code would still enter the
C loop at index 0.
C
C Register roles inside the loop:
C	rp, up, vp   base pointers biased to (end of operand - 4), see below
C	n            negative limb index, a multiple of 8, counts up to zero
C	mm7          32 - count, the right-shift amount for psrlq
C	mm0, mm1     alternately hold the newest difference limb; the other
C	             one holds the previous limb in its low half
C	eax, edx     alternately hold the newest difference limb

define(`rp',	`%edi')
define(`up',	`%esi')
define(`vp',	`%ebx')
define(`n',	`%ecx')
define(`cnt',	`%mm7')

ASM_START()

	TEXT
	ALIGN(16)

PROLOGUE(mpn_lshsub_n)
	push	%edi
	push	%esi
	push	%ebx

	mov	16(%esp), rp
	mov	20(%esp), up
	mov	24(%esp), vp
	mov	28(%esp), n
	mov	$32, %eax
	sub	32(%esp), %eax
	movd	%eax, cnt		C mm7 = 32 - count

	C Point each base at (operand end - 4) so that the loop below can use
	C displacements 4..32 instead of 0..28.  The computed jump further down
	C relies on every unrolled limb group being exactly 22 bytes long, and
	C that only holds when each memory operand carries a one-byte
	C displacement.  Assemblers commonly drop a displacement of zero, which
	C would shrink the first group to 19 bytes and make the jump land in
	C the middle of an instruction whenever n is not a multiple of 8.
	lea	-4(up,n,4), up
	lea	-4(vp,n,4), vp
	lea	-4(rp,n,4), rp

	neg	n
	mov	n, %eax
	and	$-8, n			C n = -n rounded down to a multiple of 8
	and	$7, %eax		C x = (-n) mod 8 = first limb group to run
	shl	%eax				C eax = 2x
	lea	(%eax,%eax,4), %edx		C edx = 10x
ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	lea	L(ent)(%eax,%edx,2), %eax	C eax = 22x
')

	C Both shift registers start out as zero so that the limb below the
	C first one contributes nothing.  pxor leaves the carry flag alone; the
	C first sbb in the loop consumes whatever CF the address arithmetic
	C above left behind, which is 0 for the shl of a value below 8 in the
	C non-PIC path.
	pxor	%mm1, %mm1
	pxor	%mm0, %mm0

	jmp	*%eax

ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	lea	(%eax,%edx,2), %eax
	add	$L(ent)-L(here), %eax
	add	(%esp), %eax
	ret_internal
')

L(end):	C compute (cy<<cnt) | (edx>>(32-cnt))
	sbb	%eax, %eax		C eax = -borrow
	neg	%eax			C eax = borrow (0 or 1)
	mov	32(%esp), %ecx		C reload count, n is no longer needed
	shld	%cl, %edx, %eax		C edx still holds the last difference limb

	emms

	pop	%ebx
	pop	%esi
	pop	%edi
	ret

	C Eight limbs per iteration.  Each group below must assemble to exactly
	C 22 bytes (4 + 4 + 3 + 3 + 3 + 5) because the entry point is computed
	C as L(ent) + 22x.  Do not reorder, resize or schedule these
	C instructions without updating that calculation.  jecxz and lea are
	C used for loop control because they leave the borrow in CF untouched.
	ALIGN(16)
L(top):	jecxz	L(end)
L(ent):	mov	4(up,n,4), %eax
	sbb	4(vp,n,4), %eax
	movd	%eax, %mm0
	punpckldq %mm0, %mm1		C mm1 = new limb : previous limb
	psrlq	%mm7, %mm1		C low half = shifted result limb
	movd	%mm1, 4(rp,n,4)

	mov	8(up,n,4), %edx
	sbb	8(vp,n,4), %edx
	movd	%edx, %mm1
	punpckldq %mm1, %mm0
	psrlq	%mm7, %mm0
	movd	%mm0, 8(rp,n,4)

	mov	12(up,n,4), %eax
	sbb	12(vp,n,4), %eax
	movd	%eax, %mm0
	punpckldq %mm0, %mm1
	psrlq	%mm7, %mm1
	movd	%mm1, 12(rp,n,4)

	mov	16(up,n,4), %edx
	sbb	16(vp,n,4), %edx
	movd	%edx, %mm1
	punpckldq %mm1, %mm0
	psrlq	%mm7, %mm0
	movd	%mm0, 16(rp,n,4)

	mov	20(up,n,4), %eax
	sbb	20(vp,n,4), %eax
	movd	%eax, %mm0
	punpckldq %mm0, %mm1
	psrlq	%mm7, %mm1
	movd	%mm1, 20(rp,n,4)

	mov	24(up,n,4), %edx
	sbb	24(vp,n,4), %edx
	movd	%edx, %mm1
	punpckldq %mm1, %mm0
	psrlq	%mm7, %mm0
	movd	%mm0, 24(rp,n,4)

	mov	28(up,n,4), %eax
	sbb	28(vp,n,4), %eax
	movd	%eax, %mm0
	punpckldq %mm0, %mm1
	psrlq	%mm7, %mm1
	movd	%mm1, 28(rp,n,4)

	mov	32(up,n,4), %edx
	sbb	32(vp,n,4), %edx
	movd	%edx, %mm1
	punpckldq %mm1, %mm0
	psrlq	%mm7, %mm0
	movd	%mm0, 32(rp,n,4)

	lea	8(n), n			C advance index without touching CF
	jmp	L(top)

EPILOGUE()