dnl  Origin: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/addmul_1.asm

dnl  mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).

dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C			    cycles/limb
C P6 model 0-8,10-12		-
C P6 model 9   (Banias)		5.24
C P6 model 13  (Dothan)		5.24
C P4 model 0-1 (Willamette)	5
C P4 model 2   (Northwood)	5
C P4 model 3-4 (Prescott)	5

C TODO:
C  * Tweak eax/edx offsets in loop as to save some lea's
C  * Perhaps software pipeline small-case code

C INPUT PARAMETERS
C rp		sp + 4
C up		sp + 8
C n		sp + 12
C v0		sp + 16

C OVERVIEW
C For each of the n limbs, in increasing address order, the 64-bit value
C     up[i] * v0 + rp[i] + carry
C is formed; its low 32 bits are stored back to rp[i] and its high 32 bits
C become the carry for the next limb.  The carry left after the last limb is
C returned in eax.
C
C mpn_addmul_1 starts with a zero carry.  mpn_addmul_1c takes the initial
C carry from a fifth stack argument at sp + 20 and then joins the common code.
C
C n is assumed to be at least 1: the small-n loop tests its counter only
C after a limb has been processed.  (Presumably guaranteed by callers.)
C
C REGISTER USAGE
C eax		up pointer (biased in the unrolled path); return value at exit
C edx		rp pointer (biased in the unrolled path)
C ecx		n in the small-n loop; negative limb counter in the unrolled loop
C mm7		v0
C mm6		running accumulator, holds the carry between limbs
C mm0-mm3	up limbs, overwritten by their products with v0
C mm4,mm5	rp limbs, overwritten by rp limb + product
C
C No other general purpose registers are written and no stack frame is set
C up.  emms is executed before each ret.

	TEXT
	ALIGN(16)
PROLOGUE(mpn_addmul_1)
	pxor	%mm6, %mm6		C initial carry = 0
C Common entry, also reached from mpn_addmul_1c with mm6 already loaded.
L(ent):	mov	4(%esp), %edx		C edx = rp
	mov	8(%esp), %eax		C eax = up
	mov	12(%esp), %ecx		C ecx = n
	movd	16(%esp), %mm7		C mm7 = v0
	cmp	$4, %ecx
	jnc	L(big)			C n >= 4 (unsigned): use unrolled code

C Small case, n < 4: one limb per iteration, no software pipelining.
L(lp0):	movd	(%eax), %mm0		C up limb
	lea	4(%eax), %eax
	movd	(%edx), %mm4		C rp limb
	lea	4(%edx), %edx
	pmuludq	%mm7, %mm0		C up limb * v0
	paddq	%mm0, %mm4		C + rp limb
	paddq	%mm4, %mm6		C + carry
	movd	%mm6, -4(%edx)		C store low half (edx already advanced)
	psrlq	$32, %mm6		C high half is the next carry
	dec	%ecx
	jnz	L(lp0)
	movd	%mm6, %eax		C return carry
	emms
	ret

C Large case, n >= 4.  Dispatch on n mod 4 to a feed-in sequence that primes
C the pipeline and enters the 4-way unrolled loop at the matching point.
L(big):	and	$3, %ecx
	je	L(0)
	cmp	$2, %ecx
	jc	L(1)
	je	L(2)
	jmp	L(3)			C FIXME: one case should fall through

C Each feed-in sequence below sets ecx = (n mod 4) - n, a negative multiple
C of 4 that the loop counts up towards zero, and biases eax/edx so that the
C fixed offsets used inside the loop address the right limbs.

C n mod 4 == 0: enter the loop at its last stage.
L(0):	movd	(%eax), %mm3
	sub	12(%esp), %ecx		C loop count
	lea	-16(%eax), %eax
	lea	-12(%edx), %edx
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	movd	12(%edx), %mm5
	pmuludq	%mm7, %mm0
	movd	24(%eax), %mm1
	paddq	%mm3, %mm5
	movd	16(%edx), %mm4
	jmp	L(00)

C n mod 4 == 1: enter the loop at its third stage.
L(1):	movd	(%eax), %mm2
	sub	12(%esp), %ecx		C loop count
	lea	-12(%eax), %eax
	lea	-8(%edx), %edx
	movd	8(%edx), %mm4
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	paddq	%mm2, %mm4
	movd	12(%edx), %mm5
	jmp	L(01)

C n mod 4 == 2: enter the loop at its second stage.
L(2):	movd	(%eax), %mm1
	sub	12(%esp), %ecx		C loop count
	lea	-8(%eax), %eax
	lea	-4(%edx), %edx
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	movd	4(%edx), %mm5
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	jmp	L(10)

C n mod 4 == 3: falls straight through into the top of the loop.
L(3):	movd	(%eax), %mm0
	sub	12(%esp), %ecx		C loop count
	lea	-4(%eax), %eax
	pmuludq	%mm7, %mm0
	movd	8(%eax), %mm1
	movd	(%edx), %mm4
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	paddq	%mm0, %mm4
	movd	4(%edx), %mm5

C Main loop: four limbs per iteration, software pipelined.  Every stage
C multiplies an up limb loaded earlier, adds a previously formed
C rp limb + product into the accumulator mm6, loads an up limb and an rp limb
C for later stages, stores the low half of mm6 and shifts the high half down
C as the next carry.
	ALIGN(16)
L(top):	pmuludq	%mm7, %mm2
	paddq	%mm4, %mm6
	movd	16(%eax), %mm3
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
L(10):	pmuludq	%mm7, %mm3
	paddq	%mm5, %mm6
	movd	20(%eax), %mm0
	paddq	%mm2, %mm4
	movd	12(%edx), %mm5
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
L(01):	pmuludq	%mm7, %mm0
	paddq	%mm4, %mm6
	movd	24(%eax), %mm1
	paddq	%mm3, %mm5
	movd	16(%edx), %mm4
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
L(00):	pmuludq	%mm7, %mm1
	paddq	%mm5, %mm6
	movd	28(%eax), %mm2
	paddq	%mm0, %mm4
	movd	20(%edx), %mm5
	movd	%mm6, 12(%edx)
	psrlq	$32, %mm6
	lea	16(%eax), %eax		C advance up by 4 limbs
	lea	16(%edx), %edx		C advance rp by 4 limbs
	add	$4, %ecx		C counter reaches zero after the last pass
	jnz	L(top)

C Wind-down: finish the three limbs still in flight.  No further up limbs
C are loaded here; only one more rp limb is read.
L(end):	pmuludq	%mm7, %mm2
	paddq	%mm4, %mm6
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
	paddq	%mm5, %mm6
	paddq	%mm2, %mm4
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
	paddq	%mm4, %mm6
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
	movd	%mm6, %eax		C return carry
	emms
	ret
EPILOGUE()

C mpn_addmul_1c: same operation, with the initial carry taken from the fifth
C stack argument (sp + 20) instead of zero.
PROLOGUE(mpn_addmul_1c)
	movd	20(%esp), %mm6		C initial carry
	jmp	L(ent)
EPILOGUE()