dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/mul_1.asm

dnl  mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).

dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')

C			    cycles/limb
C P6 model 0-8,10-12		-
C P6 model 9   (Banias)		4.17
C P6 model 13  (Dothan)		4.17
C P4 model 0-1 (Willamette)	4
C P4 model 2   (Northwood)	4
C P4 model 3-4 (Prescott)	4.55

C TODO:
C  * Tweak eax/edx offsets in loop as to save some lea's
C  * Perhaps software pipeline small-case code

C INPUT PARAMETERS
C rp		sp + 4
C up		sp + 8
C n		sp + 12
C v0		sp + 16
C carry-in	sp + 20		(read by the mpn_mul_1c entry point only)

C OVERVIEW
C   Multiplies the n 32-bit limbs at up by v0, stores the n low result limbs
C   at rp, and returns the final carry limb in eax.  mpn_mul_1 starts with a
C   zero carry; mpn_mul_1c loads the initial carry from sp + 20 and then
C   shares all remaining code through L(ent).
C
C   n = 0 is not handled: the small-case loop decrements the count before
C   testing it.
C
C REGISTER USE
C   eax		up cursor (overwritten with the return value before ret)
C   edx		rp cursor
C   ecx		n in the small case; negative limb counter in the main loop
C   mm7		v0
C   mm6		64-bit accumulator: low half is the next result limb, and
C		after psrlq the high half becomes the running carry
C   mm0-mm3	rotating 64-bit products / prefetched source limbs
C
C   Only eax, ecx, edx and MMX registers are written; emms is executed before
C   each ret to leave the MMX state clean.

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_1)
	pxor	%mm6, %mm6		C carry-in = 0
L(ent):	mov	4(%esp), %edx		C edx = rp
	mov	8(%esp), %eax		C eax = up
	mov	12(%esp), %ecx		C ecx = n
	movd	16(%esp), %mm7		C mm7 = v0
	cmp	$4, %ecx
	jnc	L(big)			C n >= 4 uses the 4-way unrolled loop

C Small case, n < 4: one limb per iteration.
L(lp0):	movd	(%eax), %mm0
	lea	4(%eax), %eax
	lea	4(%edx), %edx
	pmuludq	%mm7, %mm0
	paddq	%mm0, %mm6		C product + carry
	movd	%mm6, -4(%edx)		C store low limb (edx already advanced)
	psrlq	$32, %mm6		C keep high limb as next carry
	dec	%ecx
	jnz	L(lp0)
	movd	%mm6, %eax		C return carry-out
	emms
	ret

C Large case, n >= 4.  Dispatch on n mod 4 to one of four setup blocks.  Each
C block preloads source limbs, starts the first multiplies, biases eax/edx so
C that the fixed offsets used inside the loop line up, and sets
C ecx = (n mod 4) - n, a negative multiple of 4.  The loop adds 4 per pass
C and stops when ecx reaches zero.
L(big):	and	$3, %ecx
	je	L(0)
	cmp	$2, %ecx
	jc	L(1)
	jne	L(3)			C n mod 4 = 3; n mod 4 = 2 falls through

L(2):	movd	(%eax), %mm1
	sub	12(%esp), %ecx		C loop count
	lea	-8(%eax), %eax
	lea	-4(%edx), %edx
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	jmp	L(10)

L(0):	movd	(%eax), %mm3
	sub	12(%esp), %ecx		C loop count
	lea	-16(%eax), %eax
	lea	-12(%edx), %edx
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	pmuludq	%mm7, %mm0
	movd	24(%eax), %mm1
	jmp	L(00)

L(1):	movd	(%eax), %mm2
	sub	12(%esp), %ecx		C loop count
	lea	-12(%eax), %eax
	lea	-8(%edx), %edx
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	jmp	L(01)

L(3):	movd	(%eax), %mm0
	sub	12(%esp), %ecx		C loop count
	lea	-4(%eax), %eax
	pmuludq	%mm7, %mm0
	movd	8(%eax), %mm1
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
					C falls through into the loop

C Main loop: four limbs per pass.  Each stage multiplies a limb loaded two
C stages earlier, adds a finished product into mm6, loads a limb for a later
C stage, stores the low 32 bits of mm6 and shifts the carry down.
	ALIGN(16)
L(top):	pmuludq	%mm7, %mm2
	paddq	%mm0, %mm6
	movd	16(%eax), %mm3
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
L(10):	pmuludq	%mm7, %mm3
	paddq	%mm1, %mm6
	movd	20(%eax), %mm0
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
L(01):	pmuludq	%mm7, %mm0
	paddq	%mm2, %mm6
	movd	24(%eax), %mm1
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
L(00):	pmuludq	%mm7, %mm1
	paddq	%mm3, %mm6
	movd	28(%eax), %mm2
	movd	%mm6, 12(%edx)
	psrlq	$32, %mm6
	lea	16(%eax), %eax
	lea	16(%edx), %edx
	add	$4, %ecx
	ja	L(top)			C loop until ecx wraps to zero

C Wind-down: three limbs are still in flight in mm0, mm1 (already multiplied)
C and mm2 (loaded, multiplied here).
L(end):	pmuludq	%mm7, %mm2
	paddq	%mm0, %mm6
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
	paddq	%mm1, %mm6
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
	paddq	%mm2, %mm6
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
	movd	%mm6, %eax		C return carry-out
	emms
	ret
EPILOGUE()

C mpn_mul_1c: same as mpn_mul_1 but with an initial carry limb taken from
C sp + 20.  It seeds mm6 and joins the common code at L(ent).
PROLOGUE(mpn_mul_1c)
	movd	20(%esp), %mm6
	jmp	L(ent)
EPILOGUE()