github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/addlsh1_n.asm (about) 1 dnl Intel Pentium-4 mpn_addlsh1_n -- mpn x+2*y. 2 3 dnl Copyright 2001-2004, 2006 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C cycles/limb 35 C dst!=src1,2 dst==src1 dst==src2 36 C P6 model 0-8,10-12 - 37 C P6 model 9 (Banias) ? 38 C P6 model 13 (Dothan) ? 39 C P4 model 0-1 (Willamette) ? 40 C P4 model 2 (Northwood) 4.25 6 6 41 C P4 model 3-4 (Prescott) 5 8.5 8.5 42 43 C The slightly strange combination of indexing and pointer incrementing 44 C that's used seems to work best. Not sure why, but %ecx,4 with src1 and/or 45 C src2 is a slowdown. 46 C 47 C The dependent chain is simply the paddq of x+2*y to the previous carry, 48 C then psrlq to get the new carry. 
C That makes 4 c/l the target speed, which
C is almost achieved for separate src/dst but when src==dst the write
C combining anomalies slow it down.

C mpn_addlsh1_n (dst, src1, src2, size)
C
C Stores the low 32 bits of src1[i] + 2*src2[i] + carry into dst[i] for
C each of the size limbs, lowest limb first, and returns the carry out
C of the top limb in eax.  Each limb sum is formed as a 64-bit quantity
C in an MMX register, so the carry between limbs is simply bits 32 and up
C of the previous sum (a value in the range 0 to 2).
C
C The loop body always runs at least once, so size must be nonzero.
C ebx is preserved, eax ecx edx and mm0 mm1 mm2 are clobbered, and the
C MMX state is cleared with emms before returning.

C Stack parameter slots, in ascending offset order.
defframe(PARAM_DST,   4)
defframe(PARAM_SRC1,  8)
defframe(PARAM_SRC2, 12)
defframe(PARAM_SIZE, 16)

dnl  The src1 slot is dead once src1 has been loaded into a register, so
dnl  it doubles as the save area for ebx.
define(SAVED_EBX,`PARAM_SRC1')

	TEXT
	ALIGN(8)

PROLOGUE(mpn_addlsh1_n)
deflit(`FRAME',0)

	mov	PARAM_SIZE, %ecx
	mov	PARAM_DST, %edx

	mov	PARAM_SRC1, %eax	C fetch src1 before its slot is reused
	mov	%ebx, SAVED_EBX		C ebx is saved in the src1 slot

	mov	PARAM_SRC2, %ebx

	lea	(%edx,%ecx,4), %edx	C edx = dst + 4*size
	neg	%ecx			C ecx = -size, counts up to zero

	pxor	%mm0, %mm0		C no carry into the lowest limb

L(limb):
	C eax	src1, advanced by one limb per iteration
	C ebx	src2, advanced by one limb per iteration
	C ecx	negative count of limbs still to do
	C edx	one past the last dst limb, indexed with ecx
	C mm0	previous 64-bit limb sum, carry in the bits above 32
	C
	C The instruction order below is kept exactly as tuned for the P4.

	movd	(%ebx), %mm2		C y limb
	movd	(%eax), %mm1		C x limb
	psrlq	$32, %mm0		C reduce previous sum to its carry
	lea	4(%eax), %eax
	lea	4(%ebx), %ebx

	psllq	$1, %mm2		C 2*y, up to 33 bits
	paddq	%mm2, %mm1		C x + 2*y

	paddq	%mm1, %mm0		C x + 2*y + carry

	movd	%mm0, (%edx,%ecx,4)	C low 32 bits go to dst
	add	$1, %ecx
	jnz	L(limb)


	psrlq	$32, %mm0		C carry out of the top limb
	movd	%mm0, %eax		C return value
	mov	SAVED_EBX, %ebx
	emms
	ret

EPILOGUE()