github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/mmx/popham.asm (about) 1 dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and 2 dnl hamming distance. 3 4 dnl Copyright 2000-2002 Free Software Foundation, Inc. 5 6 dnl This file is part of the GNU MP Library. 7 dnl 8 dnl The GNU MP Library is free software; you can redistribute it and/or modify 9 dnl it under the terms of either: 10 dnl 11 dnl * the GNU Lesser General Public License as published by the Free 12 dnl Software Foundation; either version 3 of the License, or (at your 13 dnl option) any later version. 14 dnl 15 dnl or 16 dnl 17 dnl * the GNU General Public License as published by the Free Software 18 dnl Foundation; either version 2 of the License, or (at your option) any 19 dnl later version. 20 dnl 21 dnl or both in parallel, as here. 22 dnl 23 dnl The GNU MP Library is distributed in the hope that it will be useful, but 24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 26 dnl for more details. 27 dnl 28 dnl You should have received copies of the GNU General Public License and the 29 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 30 dnl see https://www.gnu.org/licenses/. 31 32 include(`../config.m4') 33 34 35 C popcount hamdist 36 C K6-2: 9.0 11.5 cycles/limb 37 C K6: 12.5 13.0 38 39 40 C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); 41 C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size); 42 C 43 C The code here isn't optimal, but it's already a 2x speedup over the plain 44 C integer mpn/generic/popcount.c,hamdist.c. 
dnl  Refuse to build unless OPERATION_popcount or OPERATION_hamdist is
dnl  defined: the rest of the file is specialised on exactly one of them.
ifdef(`OPERATION_popcount',,
`ifdef(`OPERATION_hamdist',,
`m4_error(`Need OPERATION_popcount or OPERATION_hamdist
')m4exit(1)')')

dnl  HAM(text) expands to text only when OPERATION_hamdist is defined,
dnl  otherwise to nothing.
define(HAM,
m4_assert_numargs(1)
`ifdef(`OPERATION_hamdist',`$1')')

dnl  POP(text) expands to text only when OPERATION_popcount is defined,
dnl  otherwise to nothing.
define(POP,
m4_assert_numargs(1)
`ifdef(`OPERATION_popcount',`$1')')

dnl  Argument slots and function name for the selected operation.  hamdist
dnl  takes (src, src2, size) and popcount takes (src, size), which is why
dnl  PARAM_SIZE sits at 12 in one case and 8 in the other.
dnl  NOTE(review): defframe is defined elsewhere; the numbers look like byte
dnl  displacements from the stack pointer at entry -- confirm.
HAM(`
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC2, 8)
defframe(PARAM_SRC, 4)
define(M4_function,mpn_hamdist)
')
POP(`
defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC, 4)
define(M4_function,mpn_popcount)
')

dnl  NOTE(review): MULFUNC_PROLOGUE is defined elsewhere; presumably it tells
dnl  the build which entry points this one source can provide -- confirm.
	MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)


dnl  Non-PIC builds keep the four 64-bit masks as data, each written as two
dnl  identical dwords and 8-byte aligned, so one movq fetches a whole mask.
dnl  PIC builds emit nothing here and build the masks in registers instead.
ifdef(`PIC',,`
	dnl  non-PIC

	RODATA
	ALIGN(8)

L(rodata_AAAAAAAAAAAAAAAA):
	.long	0xAAAAAAAA
	.long	0xAAAAAAAA

L(rodata_3333333333333333):
	.long	0x33333333
	.long	0x33333333

L(rodata_0F0F0F0F0F0F0F0F):
	.long	0x0F0F0F0F
	.long	0x0F0F0F0F

L(rodata_000000FF000000FF):
	.long	0x000000FF
	.long	0x000000FF
')

	TEXT
	ALIGN(32)

POP(`ifdef(`PIC', `
	C avoid shrl crossing a 32-byte boundary
	nop')')

PROLOGUE(M4_function)
deflit(`FRAME',0)

	C ecx = size in limbs.  It is halved below and then used as the
	C qword loop counter.
	movl	PARAM_SIZE, %ecx

	C Load the four masks into mm4..mm7.  PIC builds form each one from an
	C immediate: movd places it in the low dword and punpckldq of a register
	C with itself copies that dword into the high half.  Non-PIC builds load
	C the same values from the RODATA constants.
ifdef(`PIC',`
	movl	$0xAAAAAAAA, %eax
	movl	$0x33333333, %edx

	movd	%eax, %mm7
	movd	%edx, %mm6

	movl	$0x0F0F0F0F, %eax
	movl	$0x000000FF, %edx

	punpckldq %mm7, %mm7
	punpckldq %mm6, %mm6

	movd	%eax, %mm5
	movd	%edx, %mm4

	punpckldq %mm5, %mm5
	punpckldq %mm4, %mm4
',`

	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
	movq	L(rodata_3333333333333333), %mm6
	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
	movq	L(rodata_000000FF000000FF), %mm4
')

dnl  Symbolic names for the mask registers, so the loop body reads the same
dnl  in PIC and non-PIC builds.
define(REG_AAAAAAAAAAAAAAAA, %mm7)
define(REG_3333333333333333, %mm6)
define(REG_0F0F0F0F0F0F0F0F, %mm5)
define(REG_000000FF000000FF, %mm4)


	movl	PARAM_SRC, %eax
HAM(`	movl	PARAM_SRC2, %edx')

	pxor	%mm2, %mm2	C total

	C ecx = size/2 qwords, carry = low bit of size.  Even sizes go straight
	C to the qword loop; odd sizes first handle the last limb on its own.
	C NOTE(review): with size 0 this falls into L(top) with ecx 0, reads
	C -8(%eax), and the loop instruction then wraps the counter, so callers
	C presumably must pass size >= 1 -- confirm against the mpn contract.
	shrl	%ecx
	jnc	L(top)

	C Odd size: ecx is floor(size/2), so (%eax,%ecx,8) addresses the final
	C dword limb.  movd loads just that dword and zeroes the upper half of
	C the MMX register, so the missing high limb contributes no bits.
	C NOTE(review): Zdisp is defined elsewhere; presumably it forces an
	C explicit zero displacement for code-size or alignment reasons -- confirm.
Zdisp(	movd,	0,(%eax,%ecx,8),	%mm1)

HAM(`
Zdisp(	movd,	0,(%edx,%ecx,8),	%mm0)
	pxor	%mm0, %mm1
')

	C Count this extra pass in the loop counter, then enter the loop body
	C past its load since mm1 is already set up.
	incl	%ecx
	jmp	L(loaded)


	ALIGN(16)
POP(`	nop	C alignment to avoid crossing 32-byte boundaries')

L(top):
	C eax	src
	C ebx
	C ecx	counter, qwords, decrementing
	C edx	[hamdist] src2
	C
	C mm0	(scratch)
	C mm1	(scratch)
	C mm2	total (low dword)
	C mm3
	C mm4	\
	C mm5	| special constants
	C mm6	|
	C mm7	/

	C Fetch qword number ecx-1, so the operands are walked from the high
	C end down.  For hamdist the xor leaves a 1 bit wherever src and src2
	C differ, and the count of those bits is the distance.
	movq	-8(%eax,%ecx,8), %mm1
HAM(`	pxor	-8(%edx,%ecx,8), %mm1')

L(loaded):
	C mm1 holds the 64 bits to count.  First step: x - ((x & 0xAA..) >> 1)
	C leaves every 2-bit field holding the number of set bits it had.
	movq	%mm1, %mm0
	pand	REG_AAAAAAAAAAAAAAAA, %mm1

	psrlq	$1, %mm1
HAM(`	nop			C code alignment')

	psubd	%mm1, %mm0	C bit pairs
HAM(`	nop			C code alignment')


	C Add neighbouring 2-bit counts: ((x >> 2) & 0x33..) + (x & 0x33..),
	C giving a count of 0..4 in every 4-bit field.
	movq	%mm0, %mm1
	psrlq	$2, %mm0

	pand	REG_3333333333333333, %mm0
	pand	REG_3333333333333333, %mm1

	paddd	%mm1, %mm0	C nibbles


	C Add neighbouring nibble counts: ((x >> 4) & 0x0F..) + (x & 0x0F..),
	C giving a count of 0..8 in every byte.
	movq	%mm0, %mm1
	psrlq	$4, %mm0

	pand	REG_0F0F0F0F0F0F0F0F, %mm0
	pand	REG_0F0F0F0F0F0F0F0F, %mm1

	paddd	%mm1, %mm0	C bytes

	C Add every byte to the byte above it.  No mask is used at this step;
	C the low byte of each 16-bit word ends up holding that word's count
	C and the other bytes hold partial sums that are discarded later.
	movq	%mm0, %mm1
	psrlq	$8, %mm0


	paddb	%mm1, %mm0	C words


	C Same again at a distance of 16 bits: the low byte of each dword now
	C holds the count for that whole dword (at most 32).
	movq	%mm0, %mm1
	psrlq	$16, %mm0

	paddd	%mm1, %mm0	C dwords

	C Keep only the low byte of each dword, dropping the partial sums.
	pand	REG_000000FF000000FF, %mm0

	C paddd adds both dword lanes at once, but only the low lane of mm2 is
	C read at the end, so the high lane count is also shifted down and
	C added into the low lane.
	paddd	%mm0, %mm2	C low to total
	psrlq	$32, %mm0

	paddd	%mm0, %mm2	C high to total
	C loop decrements ecx and branches back while it is nonzero.
	loop	L(top)



	C Return value is the low dword of the running total.
	movd	%mm2, %eax
	C NOTE(review): emms_or_femms is defined elsewhere; presumably it emits
	C the instruction that leaves MMX state before returning -- confirm.
	emms_or_femms
	ret

EPILOGUE()