dnl  AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
dnl  distance.

dnl  Copyright 2000-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C			     popcount	     hamdist
C P3 generic			6.5		7
C P3 model 9  (Banias)		5.7		6.1
C P3 model 13 (Dothan)		5.75		6
C K7				5		6

C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
C
C The code here is almost certainly not optimal, but is already a 3x speedup
C over the generic C code.  The main improvement would be to interleave
C processing of two qwords in the loop so as to fully exploit the available
C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
C
C The loop is based on the example "Efficient 64-bit population count using
C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
C page 158 of rev E (reference in mpn/x86/k7/README).
C
C Calling convention: 32-bit x86 with all arguments on the stack (see the
C defframe offsets below).  The result is returned in eax.  Only eax, ecx,
C edx and the MMX registers are written; emms is executed before returning.
C
C Both entry points require size >= 1.  With size == 0 the code below would
C fall into L(top) with a zero counter, read the qword just below src, and
C the decl would wrap the counter instead of terminating the loop.
C
C Exactly one of OPERATION_popcount or OPERATION_hamdist must be defined
C when this file is processed; it selects which function is generated.

ifdef(`OPERATION_popcount',,
`ifdef(`OPERATION_hamdist',,
`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
')')')

C HAM(text) expands to text only when building mpn_hamdist.
define(HAM,
m4_assert_numargs(1)
`ifdef(`OPERATION_hamdist',`$1')')

C POP(text) expands to text only when building mpn_popcount.
define(POP,
m4_assert_numargs(1)
`ifdef(`OPERATION_popcount',`$1')')

HAM(`
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC2,  8)
defframe(PARAM_SRC,   4)
define(M4_function,mpn_hamdist)
')
POP(`
defframe(PARAM_SIZE,  8)
defframe(PARAM_SRC,   4)
define(M4_function,mpn_popcount)
')

MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)


ifdef(`PIC',,`
	dnl  non-PIC

	C  64-bit bit masks, loaded straight from memory with movq.
	RODATA
	ALIGN(8)

L(rodata_AAAAAAAAAAAAAAAA):
	.long	0xAAAAAAAA
	.long	0xAAAAAAAA

L(rodata_3333333333333333):
	.long	0x33333333
	.long	0x33333333

L(rodata_0F0F0F0F0F0F0F0F):
	.long	0x0F0F0F0F
	.long	0x0F0F0F0F
')

	TEXT
	ALIGN(32)

PROLOGUE(M4_function)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx

ifdef(`PIC',`
	C  PIC: build the 64-bit masks from immediates rather than loading
	C  them from rodata.  Each 32-bit pattern goes into the low dword and
	C  punpckldq then copies it into the high dword.

	movl	$0xAAAAAAAA, %eax
	movl	$0x33333333, %edx

	movd	%eax, %mm7
	movd	%edx, %mm6

	movl	$0x0F0F0F0F, %eax

	punpckldq %mm7, %mm7
	punpckldq %mm6, %mm6

	movd	%eax, %mm5

	punpckldq %mm5, %mm5

',`
	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
	movq	L(rodata_3333333333333333), %mm6
	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
')
	pxor	%mm4, %mm4		C zero, the psadbw reference operand

define(REG_AAAAAAAAAAAAAAAA,%mm7)
define(REG_3333333333333333,%mm6)
define(REG_0F0F0F0F0F0F0F0F,%mm5)
define(REG_0000000000000000,%mm4)


	movl	PARAM_SRC, %eax
HAM(`	movl	PARAM_SRC2, %edx')

	pxor	%mm2, %mm2	C total

	C  ecx = size/2 = number of whole qwords; carry = size odd.
	shrl	%ecx
	jnc	L(top)

	C  Odd size: the top limb sits at byte offset 8*ecx, just above the
	C  whole qwords.  movd loads only that limb and clears the high half
	C  of the MMX register, so it is counted as a qword with a zero
	C  upper dword.
	movd	(%eax,%ecx,8), %mm1

HAM(`	movd	(%edx,%ecx,8), %mm0
	pxor	%mm0, %mm1
')
	C  Set ZF from the qword count for the jnz at the end of the loop
	C  body, so that a size of 1 finishes after this single limb.
	orl	%ecx, %ecx
	jmp	L(loaded)


	ALIGN(16)
L(top):
	C eax	src
	C ebx
	C ecx	counter, qwords, decrementing
	C edx	[hamdist] src2
	C
	C mm0	(scratch)
	C mm1	(scratch)
	C mm2	total (low dword)
	C mm3
	C mm4	\
	C mm5	| special constants
	C mm6	|
	C mm7	/

	C  Qwords are consumed from the highest address downwards.
	movq	-8(%eax,%ecx,8), %mm1

HAM(`	pxor	-8(%edx,%ecx,8), %mm1')
	C  The flags from this decl are what the closing jnz tests; the MMX
	C  instructions in between leave EFLAGS alone.
	decl	%ecx

L(loaded):
	C  x - ((x & 0xAA..AA) >> 1) leaves a 0..2 count in each bit pair.
	movq	%mm1, %mm0
	pand	REG_AAAAAAAAAAAAAAAA, %mm1

	psrlq	$1, %mm1

	psubd	%mm1, %mm0	C bit pairs


	C  Add neighbouring pair counts into 4-bit fields.
	movq	%mm0, %mm1
	psrlq	$2, %mm0

	pand	REG_3333333333333333, %mm0
	pand	REG_3333333333333333, %mm1

	paddd	%mm1, %mm0	C nibbles


	C  Add neighbouring nibble counts into 8-bit fields.
	movq	%mm0, %mm1
	psrlq	$4, %mm0

	pand	REG_0F0F0F0F0F0F0F0F, %mm0
	pand	REG_0F0F0F0F0F0F0F0F, %mm1

	paddd	%mm1, %mm0	C bytes


	C  Sum of absolute differences against zero (mm4) adds up the eight
	C  byte counts.
	psadbw(	%mm4, %mm0)

	paddd	%mm0, %mm2	C add to total
	jnz	L(top)


	movd	%mm2, %eax	C return the low dword of the total
	emms
	ret

EPILOGUE()