github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/popcount.asm

dnl  X86-32 and X86-64 mpn_popcount using SSE2.

dnl  Copyright 2006, 2007, 2011, 2015 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')


C 32-bit			   popcount	    hamdist
C				cycles/limb	cycles/limb
C P5				      -
C P6 model 0-8,10-12		      -
C P6 model 9  (Banias)		      ?
C P6 model 13 (Dothan)		      4
C P4 model 0  (Willamette)	      ?
C P4 model 1  (?)		      ?
C P4 model 2  (Northwood)	      3.9
C P4 model 3  (Prescott)	      ?
C P4 model 4  (Nocona)		      ?
C AMD K6			      -
C AMD K7			      -
C AMD K8			      ?

C 64-bit			   popcount	    hamdist
C				cycles/limb	cycles/limb
C P4 model 4 (Nocona)		      8
C AMD K8,K9			      7.5
C AMD K10			      3.5
C Intel core2			      3.68
C Intel corei			      3.15
C Intel atom			     10.8
C VIA nano			      6.5

C TODO
C  * Make an mpn_hamdist based on this (a C sketch appears at the end of
C    this file).  Alignment could either be handled by using movdqu for one
C    operand and movdqa for the other, or by painfully shifting as we go.
C    Unfortunately, there seems to be no usable shift instruction, except
C    for one that takes an immediate count.
C  * It would probably be possible to cut a few cycles/limb using software
C    pipelining.
C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
C    integer-based popcount.  Such a combined loop would handle 6 limbs in
C    about 30 cycles on K8.
C  * We could save a byte or two by using 32-bit operations on areg.
C  * Check if using movdqa to a temp and then a register-based pand is faster.

ifelse(GMP_LIMB_BITS,`32',
`	define(`up',  `%edx')
	define(`n',   `%ecx')
	define(`areg',`%eax')
	define(`breg',`%ebx')
	define(`zero',`%xmm4')
	define(`LIMB32',`	$1')
	define(`LIMB64',`dnl')
',`
	define(`up',  `%rdi')
	define(`n',   `%rsi')
	define(`areg',`%rax')
	define(`breg',`%rdx')
	define(`zero',`%xmm8')
	define(`LIMB32',`dnl')
	define(`LIMB64',`	$1')
')

define(`mm01010101',`%xmm6')
define(`mm00110011',`%xmm7')
define(`mm00001111',`%xmm2')

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))

undefine(`psadbw')			C override inherited m4 version
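
C The kernel below reduces each xmm register with the classic SWAR
C bit-count.  As a rough C sketch of one 64-bit lane (illustrative only;
C the mask constants correspond to the table at the end of this file):
C
C	x -= (x >> 1) & 0x5555555555555555;	/* 2-bit field sums */
C	x  = (x & 0x3333333333333333)
C	   + ((x >> 2) & 0x3333333333333333);	/* 4-bit field sums */
C	x  = (x & 0x0f0f0f0f0f0f0f0f)
C	   + ((x >> 4) & 0x0f0f0f0f0f0f0f0f);	/* per-byte sums    */
C
C psadbw against an all-zero register then adds the eight byte sums of each
C half into a 64-bit lane, which paddq accumulates into %xmm3.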
C This file is shared between 32-bit and 64-bit builds.  Only the former has
C LEAL.  Default LEAL as an alias of LEA.
ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')')

ASM_START()

C Make cnsts global to work around Apple relocation bug.
ifdef(`DARWIN',`
	define(`cnsts', MPN(popccnsts))
	GLOBL	cnsts')

	TEXT
	ALIGN(32)
PROLOGUE(mpn_popcount)

LIMB32(`mov	4(%esp), up	')
LIMB32(`mov	8(%esp), n	')
LIMB32(`push	%ebx		')

	pxor	%xmm3, %xmm3		C zero grand total count
LIMB64(`pxor	zero, zero	')
ifdef(`PIC',`
	LEAL(	cnsts, breg)
',`
LIMB32(`mov	$cnsts, breg	')
LIMB64(`movabs	$cnsts, breg	')
')

	movdqa	-48(breg), mm01010101
	movdqa	-32(breg), mm00110011
	movdqa	-16(breg), mm00001111

	mov	up, areg
	and	$-16, up		C round `up' down to 128-bit boundary
	and	$12, areg		C 32:areg = 0, 4, 8, 12
					C 64:areg = 0, 8
	movdqa	(up), %xmm0
	pand	64(breg,areg,4), %xmm0
	shr	$m4_log2(GMP_LIMB_BYTES), %eax
	add	areg, n			C compensate n for rounded down `up'

	pxor	%xmm4, %xmm4
	sub	$LIMBS_PER_XMM, n
	jbe	L(sum)

	sub	$LIMBS_PER_XMM, n
	ja	L(ent)
	jmp	L(lsum)

	ALIGN(16)
L(top):	movdqa	(up), %xmm0
L(ent):	movdqa	16(up), %xmm4

	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	psrld	$1, %xmm0
	psrld	$1, %xmm4
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4
	psubd	%xmm0, %xmm1
	psubd	%xmm4, %xmm5

	movdqa	%xmm1, %xmm0
	movdqa	%xmm5, %xmm4
	psrlq	$2, %xmm1
	psrlq	$2, %xmm5
	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
	paddq	%xmm0, %xmm1
	paddq	%xmm4, %xmm5

LIMB32(`pxor	zero, zero	')

	add	$32, up
	sub	$LIMBS_PER_2XMM, n

	paddq	%xmm5, %xmm1
	movdqa	%xmm1, %xmm0
	psrlq	$4, %xmm1
	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1
	paddq	%xmm0, %xmm1

	psadbw	zero, %xmm1
	paddq	%xmm1, %xmm3		C add to grand total

	jnc	L(top)
L(end):
	add	$LIMBS_PER_2XMM, n
	jz	L(rt)
	movdqa	(up), %xmm0
	pxor	%xmm4, %xmm4
	sub	$LIMBS_PER_XMM, n
	jbe	L(sum)
L(lsum):
	movdqa	%xmm0, %xmm4
	movdqa	16(up), %xmm0
L(sum):
	shl	$m4_log2(GMP_LIMB_BYTES), n
	and	$12, n
	pand	(breg,n,4), %xmm0

	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	psrld	$1, %xmm0
	psrld	$1, %xmm4
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4
	psubd	%xmm0, %xmm1
	psubd	%xmm4, %xmm5

	movdqa	%xmm1, %xmm0
	movdqa	%xmm5, %xmm4
	psrlq	$2, %xmm1
	psrlq	$2, %xmm5
	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
	paddq	%xmm0, %xmm1
	paddq	%xmm4, %xmm5

LIMB32(`pxor	zero, zero	')

	paddq	%xmm5, %xmm1
	movdqa	%xmm1, %xmm0
	psrlq	$4, %xmm1
	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1
	paddq	%xmm0, %xmm1

	psadbw	zero, %xmm1
	paddq	%xmm1, %xmm3		C add to grand total


C Add the two 64-bit halves of the grand total counter
L(rt):	movdqa	%xmm3, %xmm0
	psrldq	$8, %xmm3
	paddq	%xmm3, %xmm0
	movd	%xmm0, areg		C movq avoided due to gas bug

LIMB32(`pop	%ebx		')
	ret

EPILOGUE()
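C The object below is laid out so that the cnsts label sits just past the
C three SWAR masks, which the code above fetches from -48(breg), -32(breg)
C and -16(breg).  The 64 bytes from cnsts onwards are the high-end masks
C indexed at L(sum); the 64 bytes after those are the low-end masks indexed
C via 64(breg,areg,4) to cancel the limbs below `up' after it was rounded
C down to a 128-bit boundary.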
DEF_OBJECT(dummy,16)
C Three magic constants used for masking out bits
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55

	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33

	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
cnsts:
C Masks for high end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(dummy)
ASM_END()
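
dnl  A possible starting point for the mpn_hamdist TODO above: the hamming
dnl  distance of two operands is the population count of their xor.  As a
dnl  portable C sketch (the popcount_limb helper is hypothetical; a SWAR
dnl  reduction as sketched above would serve):
dnl
dnl	mp_bitcnt_t hamdist (const mp_limb_t *up, const mp_limb_t *vp,
dnl			     mp_size_t n)
dnl	{
dnl	  mp_bitcnt_t cnt = 0;
dnl	  while (n-- > 0)
dnl	    cnt += popcount_limb (up[n] ^ vp[n]);
dnl	  return cnt;
dnl	}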