dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/mod_34lsub1.asm (about)

dnl  Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.

dnl  Copyright 2000-2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C Pentium4: 1.0 cycles/limb


C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C Enhancements:
C
C There might be a couple of cycles to save by using plain integer code for
C more small sizes.  2 limbs measures about 20 cycles, but 3 limbs jumps to
C about 46 (inclusive of some function call overheads).
C -----------------------------------------------------------------------------
C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C Fold the limb array at src into a single value congruent to it modulo
C 2^24-1.  The result is not necessarily fully reduced: a one-limb operand
C is returned unchanged and the other paths return a sum of partial fields.
C
C In:    PARAM_SRC   pointer to the least significant limb
C        PARAM_SIZE  limb count; src[0] is loaded before the count is
C                    examined, so the count must be at least 1
C Out:   eax         the folded value
C Uses:  ecx, edx, flags; for 3 or more limbs also mm0-mm7, with emms
C        executed before returning
C
C Because 2^24 == 1 mod 2^24-1, a 32-bit limb at index i has weight
C 2^(8*(i mod 3)), that is 1, 2^8 or 2^16.  Limbs are therefore summed into
C three accumulators by index mod 3 and the accumulators are combined at
C those bit offsets at the end.
C -----------------------------------------------------------------------------

defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC,  4)

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_34lsub1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %edx
	movl	(%edx), %eax		C src[0], also the result for size 1

	subl	$2, %ecx
	ja	L(three_or_more)	C size > 2 (unsigned compare)
	jne	L(one)			C size != 2, return src[0] as it is

	C Exactly two limbs, done with plain integer code:
	C   src[0] = hi8:lo24   contributes lo24 + hi8       (2^24 == 1)
	C   src[1] = hi16:lo16  has weight 2^8, so it contributes
	C                       lo16*2^8 + hi16              (2^24 == 1)

	movl	4(%edx), %edx		C src[1]
	movl	%eax, %ecx
	shrl	$24, %eax		C src[0] high 8 bits

	andl	$0x00FFFFFF, %ecx	C src[0] low 24 bits
	addl	%ecx, %eax

	movl	%edx, %ecx
	shll	$8, %edx		C src[1] * 2^8, top 8 bits drop out

	shrl	$16, %ecx		C src[1] high 16 bits, weight 2^24 == 1
	addl	%ecx, %eax

	andl	$0x00FFFF00, %edx	C src[1] low 16 bits, at weight 2^8
	addl	%edx, %eax

L(one):
	ret


L(three_or_more):
	pxor	%mm0, %mm0		C clear the three accumulators
	pxor	%mm1, %mm1
	pxor	%mm2, %mm2

	pcmpeqd	%mm7, %mm7		C all ones
	psrlq	$32, %mm7	C 0x00000000FFFFFFFF, low 32 bits

	pcmpeqd	%mm6, %mm6		C all ones
	psrlq	$40, %mm6	C 0x0000000000FFFFFF, low 24 bits

L(top):
	C Register roles in the loop:
	C   ecx  limbs still to be added minus 2; ends at 0, -1 or -2
	C   edx  src pointer, advanced by 12 bytes per pass
	C   mm0  64-bit sum of limbs with index 0 mod 3
	C   mm1  64-bit sum of limbs with index 1 mod 3
	C   mm2  64-bit sum of limbs with index 2 mod 3
	C   mm3  scratch for the limb just loaded
	C   mm6  0x0000000000FFFFFF
	C   mm7  0x00000000FFFFFFFF
	C   eax, ebx, mm4, mm5 are not used here

	movd	(%edx), %mm3		C movd zero-extends the limb to 64 bits
	paddq	%mm3, %mm0

	movd	4(%edx), %mm3
	paddq	%mm3, %mm1

	movd	8(%edx), %mm3
	paddq	%mm3, %mm2

	addl	$12, %edx
	subl	$3, %ecx
	ja	L(top)			C loop while 3 or more limbs remain


	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively

	addl	$1, %ecx
	js	L(combine)		C 0 more

	movd	(%edx), %mm3
	paddq	%mm3, %mm0

	C The MMX instructions above leave the integer flags alone, so this
	C jz still tests the result of the addl.
	jz	L(combine)		C 1 more

	movd	4(%edx), %mm3
	paddq	%mm3, %mm1

L(combine):
	C Split each 64-bit accumulator into 32-bit halves.  A high half
	C carries an extra factor 2^32 == 2^8, so it belongs with the
	C accumulator of the next weight: mm0 high joins mm1 low, mm1 high
	C joins mm2 low, and mm2 high wraps around to join mm0 low.

	movq	%mm7, %mm3		C low halves
	pand	%mm0, %mm3

	movq	%mm7, %mm4
	pand	%mm1, %mm4

	movq	%mm7, %mm5
	pand	%mm2, %mm5

	psrlq	$32, %mm0		C high halves
	psrlq	$32, %mm1
	psrlq	$32, %mm2

	paddq	%mm0, %mm4		C fold high halves to give 33 bits each
	paddq	%mm1, %mm5
	paddq	%mm2, %mm3

	psllq	$8, %mm4		C combine at respective offsets
	psllq	$16, %mm5
	paddq	%mm4, %mm3
	paddq	%mm5, %mm3		C at most 50 significant bits

	pand	%mm3, %mm6		C fold at 24 bits
	psrlq	$24, %mm3

	paddq	%mm6, %mm3		C 24-bit field + 26-bit field fits a dword
	movd	%mm3, %eax

	ASSERT(z,	C nothing left in high dword
	`psrlq	$32, %mm3
	movd	%mm3, %ecx
	orl	%ecx, %ecx')

	emms				C leave MMX state before returning
	ret

EPILOGUE()