dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/mmx/popham.asm

dnl  Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
dnl  hamming distance.

dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                            popcount        hamdist
C P3 model 9  (Banias)          ?               ?
C P3 model 13 (Dothan)          6               6
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)       8               9
C P4 model 3  (Prescott)        8               9
C P4 model 4  (Nocona)

C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
C
C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
C Two movd's and a punpckldq seem to be the same speed as an aligned movq,
C and using them saves fiddling about with alignment testing on entry.
C
C For popcount there's 13 mmx instructions in the loop, so perhaps 6.5 c/l
C might be possible, but 8 c/l relying on out-of-order execution is already
C quite reasonable.

C This one source builds either function.  One of OPERATION_popcount or
C OPERATION_hamdist has to be defined; if neither is, m4_error is raised.

ifdef(`OPERATION_popcount',,
`ifdef(`OPERATION_hamdist',,
`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
')')')

C HAM(text) expands to its argument only when OPERATION_hamdist is defined,
C and to nothing otherwise.

define(HAM,
m4_assert_numargs(1)
`ifdef(`OPERATION_hamdist',`$1')')

C POP(text) expands to its argument only when OPERATION_popcount is defined,
C and to nothing otherwise.

define(POP,
m4_assert_numargs(1)
`ifdef(`OPERATION_popcount',`$1')')

C Per-operation argument names and the name of the function being built.
C defframe comes from the included m4 support; presumably its number is the
C byte offset of the argument from the stack pointer at entry -- TODO confirm.

HAM(`
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC2,  8)
defframe(PARAM_SRC,   4)
define(M4_function,mpn_hamdist)
')
POP(`
defframe(PARAM_SIZE,  8)
defframe(PARAM_SRC,   4)
define(M4_function,mpn_popcount)
')

MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)


C Non-PIC builds keep the three 64-bit bit masks in read-only data, each
C stored as two identical 32-bit halves, and load them with movq.  PIC builds
C emit no data here and instead construct the masks in registers below.

ifdef(`PIC',,`
	dnl  non-PIC
	RODATA
	ALIGN(8)
L(rodata_AAAAAAAAAAAAAAAA):
	.long	0xAAAAAAAA
	.long	0xAAAAAAAA
L(rodata_3333333333333333):
	.long	0x33333333
	.long	0x33333333
L(rodata_0F0F0F0F0F0F0F0F):
	.long	0x0F0F0F0F
	.long	0x0F0F0F0F
')

C -----------------------------------------------------------------------
C M4_function: mpn_popcount or mpn_hamdist, as selected above.
C
C In:    PARAM_SRC   pointer to the source limbs (32 bits each)
C        PARAM_SRC2  [hamdist] pointer to the second source
C        PARAM_SIZE  number of limbs
C Out:   eax = number of 1 bits in the size limbs at src; for hamdist the
C        number of 1 bits in src[i] xor src2[i] over the size limbs.
C Uses:  eax, ecx, flags, mm0-mm2, mm4-mm7; edx in the hamdist build and as
C        a temporary in PIC builds; mm3 in the hamdist build.
C        emms is executed before returning.
C
C Limbs are consumed two at a time in L(top); an odd limb (the only limb
C when size is 1, or the last limb of an odd size) goes through L(last).
C Both paths share the bit counting code at L(loaded).
C
C NOTE(review): size 0 is not handled.  The initial subl would leave ecx at
C -1, ja would not be taken and L(last) would load the limb just below src.
C Presumably callers guarantee size >= 1 -- confirm against the mpn
C interface rules.
C -----------------------------------------------------------------------

	TEXT
	ALIGN(16)

PROLOGUE(M4_function)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %eax

ifdef(`PIC',`
	C Build each mask from a 32-bit immediate copied into both halves
	C of the mmx register, avoiding any reference to static data.
	movl	$0xAAAAAAAA, %edx
	movd	%edx, %mm7
	punpckldq %mm7, %mm7

	movl	$0x33333333, %edx
	movd	%edx, %mm6
	punpckldq %mm6, %mm6

	movl	$0x0F0F0F0F, %edx
	movd	%edx, %mm5
	punpckldq %mm5, %mm5

	C edx is free again only now, so src2 is fetched after the masks.
HAM(`	movl	PARAM_SRC2, %edx')

',`
	dnl non-PIC
HAM(`	movl	PARAM_SRC2, %edx')
	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
	movq	L(rodata_3333333333333333), %mm6
	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
')

	pxor	%mm4, %mm4	C zero
	pxor	%mm0, %mm0	C total

	C ecx becomes size-1.  The branch is taken when that is nonzero and
	C the subtract did not borrow, ie. there are at least two limbs.
	subl	$1, %ecx
	ja	L(top)

L(last):
	C Single limb pass.  For size >= 1, ecx is 0 whenever control gets
	C here (size 1 on entry, or one limb left after the pairs), so the
	C indexed address is just the limb at eax (and edx).  movd clears
	C the high half of the mmx register, so it adds nothing to the count.
	movd	(%eax,%ecx,4), %mm1		C src high limb
HAM(`	movd	(%edx,%ecx,4), %mm2
	pxor	%mm2, %mm1
')
	jmp	L(loaded)


L(top):
	C eax	src
	C ebx
	C ecx	counter, size-1 to 2 or 1, inclusive
	C edx	[hamdist] src2
	C
	C mm0	total (low dword)
	C mm1	(scratch)
	C mm2	(scratch)
	C mm3	[hamdist] (scratch)
	C mm4	0x0000000000000000
	C mm5	0x0F0F0F0F0F0F0F0F
	C mm6	0x3333333333333333
	C mm7	0xAAAAAAAAAAAAAAAA

	C Two limbs are combined into one 64-bit value using a pair of
	C 32-bit loads, rather than a possibly unaligned movq.
	movd	(%eax), %mm1
	movd	4(%eax), %mm2
	punpckldq %mm2, %mm1
	addl	$8, %eax

HAM(`	movd	(%edx), %mm2
	movd	4(%edx), %mm3
	punpckldq %mm3, %mm2
	pxor	%mm2, %mm1
	addl	$8, %edx
')

L(loaded):
	C mm1 holds the bits to count: source limbs for popcount, src xor
	C src2 for hamdist.

	C x - ((x & 0xAA..) >> 1) leaves in each 2-bit field the number of
	C bits that were set in it.
	movq	%mm7, %mm2
	pand	%mm1, %mm2
	psrlq	$1, %mm2
	psubd	%mm2, %mm1	C bit pairs

	C (x & 0x33..) + ((x >> 2) & 0x33..) adds neighbouring 2-bit counts
	C into 4-bit fields.
	movq	%mm6, %mm2
	pand	%mm1, %mm2
	psrlq	$2, %mm1
	pand	%mm6, %mm1
	paddd	%mm2, %mm1	C nibbles

	C (x & 0x0F..) + ((x >> 4) & 0x0F..) adds neighbouring nibble counts
	C into 8-bit fields.
	movq	%mm5, %mm2
	pand	%mm1, %mm2
	psrlq	$4, %mm1
	pand	%mm5, %mm1
	paddd	%mm2, %mm1	C bytes

	C Sum of absolute differences against the zero in mm4, which adds
	C the eight byte counts together.  psadbw is written as an m4 macro
	C call here; presumably the macro supplies the instruction for
	C assemblers without the mnemonic -- TODO confirm.
	psadbw(	%mm4, %mm1)
	paddd	%mm1, %mm0	C to total

	subl	$2, %ecx
	jg	L(top)		C two or more limbs still to do

	C After a two-limb pass ecx is 0 or -1 representing respectively 1 or
	C 0 further limbs.  After the single limb pass through L(last) ecx is
	C -2, so neither branch is taken and the function returns.
	jz	L(last)


	movd	%mm0, %eax	C return the low dword of the total
	emms
	ret

EPILOGUE()