github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/popham.asm (about) 1 dnl AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance. 2 3 dnl Copyright 2004, 2005, 2007, 2010-2012 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 32 include(`../config.m4') 33 34 35 C popcount hamdist 36 C cycles/limb cycles/limb 37 C AMD K8,K9 6 7 38 C AMD K10 6 7 39 C Intel P4 12 14.3 40 C Intel core2 7 8 41 C Intel corei ? 7.3 42 C Intel atom 16.5 17.5 43 C VIA nano 8.75 10.4 44 45 C TODO 46 C * Tune. It should be possible to reach 5 c/l for popcount and 6 c/l for 47 C hamdist for K8/K9. 
ifdef(`OPERATION_popcount',`
  define(`func',`mpn_popcount')
  define(`up',		`%rdi')
  define(`n',		`%rsi')
  define(`h55555555',	`%r10')
  define(`h33333333',	`%r11')
  define(`h0f0f0f0f',	`%rcx')
  define(`h01010101',	`%rdx')
  define(`POP',		`$1')
  define(`HAM',		`dnl')
')
ifdef(`OPERATION_hamdist',`
  define(`func',`mpn_hamdist')
  define(`up',		`%rdi')
  define(`vp',		`%rsi')
  define(`n',		`%rdx')
  define(`h55555555',	`%r10')
  define(`h33333333',	`%r11')
  define(`h0f0f0f0f',	`%rcx')
  define(`h01010101',	`%r14')
  define(`POP',		`dnl')
  define(`HAM',		`$1')
')

C func -- count the set bits of {up,n} (popcount), or of {up,n} xor {vp,n}
C (hamdist).
C
C Operands, as assigned by the defines above:
C   popcount:  up = %rdi, n = %rsi
C   hamdist:   up = %rdi, vp = %rsi, n = %rdx
C Result:      the total is accumulated in %rax and left there at ret.
C              A limb count of zero yields 0.
C
C Lines wrapped in POP are kept only for popcount and lines wrapped in HAM
C only for hamdist; the other wrapper expands to dnl and drops its line.
C
C Registers:   %r8/%r9 hold the limb being reduced, %r12/%r13 the second
C              limb of a pair.  %r12 and %r13 (plus %r14, which holds
C              h01010101 for hamdist) are pushed on entry and popped at
C              L(end).
C
C Reduction applied to a limb x:
C   x - ((x >> 1) & 0x55..)                32 2-bit fields, each 0..2
C   (x & 0x33..) + ((x >> 2) & 0x33..)     16 4-bit fields, each 0..4
C   (x & 0x0f..) + ((x >> 4) & 0x0f..)      8 8-bit fields
C   (x * 0x0101..) >> 56                    sum of the 8 fields
C The main loop adds the 4-bit fields of two limbs together (0..8, which
C still fits in 4 bits) before the last two steps, so a single multiply
C serves both limbs.
C
C Indexing: the pointers are advanced to the end of the operands and n is
C negated, so n counts up towards zero and doubles as the limb index.


MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(func)
POP(`	FUNC_ENTRY(2)		')
HAM(`	FUNC_ENTRY(3)		')
	push	%r12
	push	%r13
HAM(`	push	%r14		')

	mov	$0x5555555555555555, h55555555
	mov	$0x3333333333333333, h33333333
	mov	$0x0f0f0f0f0f0f0f0f, h0f0f0f0f
	mov	$0x0101010101010101, h01010101

	xor	R32(%rax), R32(%rax)	C total = 0, set before the flag test below

	lea	(up,n,8), up		C lea leaves the flags alone
HAM(`	lea	(vp,n,8), vp	')
	neg	n			C ZF set exactly when n was 0
	jz	L(end)			C no limbs: return 0 instead of running the
					C loop, which would read past the operands

	bt	$0, R32(n)		C -n has the same low bit as n
	jnc	L(top)			C even count: only limb pairs to do

C Odd count: reduce one limb on its own so that an even number remain.
	mov	(up,n,8), %r8
HAM(`	xor	(vp,n,8), %r8	')

	mov	%r8, %r9
	shr	%r8
	and	h55555555, %r8
	sub	%r8, %r9		C 32 2-bit fields (0..2)

	mov	%r9, %r8
	shr	$2, %r9
	and	h33333333, %r8
	and	h33333333, %r9
	add	%r8, %r9		C 16 4-bit fields (0..4)

	mov	%r9, %r8
	shr	$4, %r9
	and	h0f0f0f0f, %r8
	and	h0f0f0f0f, %r9
	add	%r8, %r9		C 8 8-bit fields (0..8)

	imul	h01010101, %r9		C sum the 8 fields in high 8 bits
	shr	$56, %r9

	mov	%r9, %rax		C total so far
	add	$1, n
	jz	L(end)			C that was the only limb

C Main loop: two limbs per iteration.
	ALIGN(16)
L(top):	mov	(up,n,8), %r8
	mov	8(up,n,8), %r12
HAM(`	xor	(vp,n,8), %r8	')
HAM(`	xor	8(vp,n,8), %r12	')

	mov	%r8, %r9
	mov	%r12, %r13
	shr	%r8
	shr	%r12
	and	h55555555, %r8
	and	h55555555, %r12
	sub	%r8, %r9		C 32 2-bit fields (0..2)
	sub	%r12, %r13		C 32 2-bit fields (0..2)

	mov	%r9, %r8
	mov	%r13, %r12
	shr	$2, %r9
	shr	$2, %r13
	and	h33333333, %r8
	and	h33333333, %r9
	and	h33333333, %r12
	and	h33333333, %r13
	add	%r8, %r9		C 16 4-bit fields (0..4)
	add	%r12, %r13		C 16 4-bit fields (0..4)

	add	%r13, %r9		C 16 4-bit fields (0..8)
	mov	%r9, %r8
	shr	$4, %r9
	and	h0f0f0f0f, %r8
	and	h0f0f0f0f, %r9
	add	%r8, %r9		C 8 8-bit fields (0..16)

	imul	h01010101, %r9		C sum the 8 fields in high 8 bits
	shr	$56, %r9

	add	%r9, %rax		C add to total
	add	$2, n
	jnc	L(top)			C carry out only on the step from -2 to 0

L(end):
HAM(`	pop	%r14		')
	pop	%r13
	pop	%r12
	FUNC_EXIT()
	ret
EPILOGUE()