dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount.

dnl  Copyright 2006, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		    cycles/limb
C 7400,7410 (G4):	 ?
C 744x,745x (G4+):	 1.125
C 970 (G5):		 2.25

C TODO
C  * Rewrite the awkward huge n outer loop code.
C  * Two lvx, two vperm, and two vxor could make us a similar hamdist.
C  * Compress cnsts table in 64-bit mode, only half the values are needed.
C Limb geometry: limbs held in one 16-byte vector register (VR), and limbs
C consumed per unrolled (two-VR) loop iteration.
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))

define(`OPERATION_popcount')

C Argument registers, standard mpn convention: r3 = source limb pointer,
C r4 = limb count.  The result (total bit count) is returned in r3.
define(`ap', `r3')
define(`n', `r4')

C Fixed-role vector registers:
C   rtab  16-entry per-nibble popcount table, indexed via vperm
C   cnt4  byte splat of 4, shift count used to expose high nibbles
define(`rtab', `v10')
define(`cnt4', `v11')

C LIMB32(x) / LIMB64(x) emit their argument only for the matching limb size.
ifelse(GMP_LIMB_BITS,32,`
define(`LIMB32',`	$1')
define(`LIMB64',`')
',`
define(`LIMB32',`')
define(`LIMB64',`	$1')
')

C The inner loop handles up to 2^34 bits, i.e., 2^31 64-limbs, due to overflow
C in vsum4ubs.  For large operands, we work in chunks, of size LIMBS_PER_CHUNK.
define(`LIMBS_PER_CHUNK', 0x1000)
define(`LIMBS_CHUNK_THRES', 0x1001)

ASM_START()
PROLOGUE(mpn_popcount,toc)
C mp_limb_t mpn_popcount (mp_srcptr ap, mp_size_t n)
C
C Method: each byte is split into its two nibbles with vsrb, both nibbles
C are translated to bit counts through rtab by vperm, per-byte sums are
C formed with vaddubm, and vsum4ubs folds bytes into the 32-bit word
C accumulators v12/v13.  r8 carries the running grand total; r10 holds the
C saved VRSAVE; r11 points at the cnsts table (rtab plus edge masks).
	mfspr	r10, 256		C save caller VRSAVE
	oris	r0, r10, 0xfffc		C Set VRSAVE bit 0-13
	mtspr	256, r0

ifdef(`HAVE_ABI_mode32',
`	rldicl	n, n, 0, 32')		C zero extend n

C Load various constants into vector registers
	LEAL(	r11, cnsts)		C r11 = address of rtab/mask table
	li	r12, 16			C offset of the 2nd VR in each pair
	vspltisb cnt4, 4		C 0x0404...04 used as shift count

	li	r7, 160			C NOTE(review): r7 is overwritten by the
					C addi below before any read -- looks
					C dead; confirm against upstream GMP
	lvx	rtab, 0, r11		C row 0 of cnsts = nibble popcounts

LIMB64(`lis	r0, LIMBS_CHUNK_THRES	')
LIMB64(`cmpd	cr7, n, r0		') C cr7: huge-operand test, read below

C Load the first vector from ap rounded down to 16 bytes, and mask away the
C bytes that lie below ap, using a low-end mask row selected by ap mod 16.
	lvx	v0, 0, ap
	addi	r7, r11, 80		C r7 = low-end mask rows (cnsts+80)
	rlwinm	r6, ap, 2,26,29		C r6 = (ap mod 16) * 4 = mask row offset
	lvx	v8, r7, r6
	vand	v0, v0, v8		C clear limbs below ap

C Compensate the limb count for the limbs gained by rounding ap down.
LIMB32(`rlwinm	r8, ap, 30,30,31	')
LIMB64(`rlwinm	r8, ap, 29,31,31	')
	add	n, n, r8		C compensate n for rounded down `ap'

	vxor	v1, v1, v1
	li	r8, 0			C grand total count

	vxor	v12, v12, v12		C zero total count
	vxor	v13, v13, v13		C zero total count

C Dispatch: at most one VR of limbs -> single-vector tail at L(sum);
C at most two VRs -> two-vector tail via L(lsum); otherwise run the loop.
	addic.	n, n, -LIMBS_PER_VR
	ble	L(sum)

	addic.	n, n, -LIMBS_PER_VR
	ble	L(lsum)

C For 64-bit machines, handle huge n that would overflow vsum4ubs
LIMB64(`ble	cr7, L(small)		')
LIMB64(`addis	r9, n, -LIMBS_PER_CHUNK	') C remaining n
LIMB64(`lis	n, LIMBS_PER_CHUNK	')

	ALIGN(16)
L(small):
LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
	addi	r7, r7, 1
	mtctr	r7			C copy n to count register
	b	L(ent)			C enter mid-loop: v0 is already loaded
					C (and edge-masked) above

C Main loop: two VRs (LIMBS_PER_2VR limbs) per iteration.
	ALIGN(16)
L(top):
	lvx	v0, 0, ap
L(ent):	lvx	v1, r12, ap
	addi	ap, ap, 32
	vsrb	v8, v0, cnt4		C high nibbles of v0
	vsrb	v9, v1, cnt4		C high nibbles of v1
	vperm	v2, rtab, rtab, v0	C low-nibble counts, v0
	vperm	v3, rtab, rtab, v8	C high-nibble counts, v0
	vperm	v4, rtab, rtab, v1	C low-nibble counts, v1
	vperm	v5, rtab, rtab, v9	C high-nibble counts, v1
	vaddubm	v6, v2, v3		C per-byte counts, v0
	vaddubm	v7, v4, v5		C per-byte counts, v1
	vsum4ubs v12, v6, v12		C fold bytes into word accumulator
	vsum4ubs v13, v7, v13
	bdnz	L(top)

C Tail: 0 .. LIMBS_PER_2VR-1 limbs remain.
	andi.	n, n, eval(LIMBS_PER_2VR-1)
	beq	L(rt)

	lvx	v0, 0, ap
	vxor	v1, v1, v1
	cmpwi	n, LIMBS_PER_VR
	ble	L(sum)
L(lsum):
	vor	v1, v0, v0		C move full vector aside to v1
	lvx	v0, r12, ap		C load final, possibly partial, vector
L(sum):
C Mask away bytes past the last limb: pick a high-end mask row from
C cnsts+16 using the remaining limb count.
LIMB32(`rlwinm	r6, n, 4,26,27	')
LIMB64(`rlwinm	r6, n, 5,26,26	')
	addi	r7, r11, 16
	lvx	v8, r7, r6
	vand	v0, v0, v8		C clear limbs beyond the operand
	vsrb	v8, v0, cnt4
	vsrb	v9, v1, cnt4
	vperm	v2, rtab, rtab, v0
	vperm	v3, rtab, rtab, v8
	vperm	v4, rtab, rtab, v1
	vperm	v5, rtab, rtab, v9
	vaddubm	v6, v2, v3
	vaddubm	v7, v4, v5
	vsum4ubs v12, v6, v12
	vsum4ubs v13, v7, v13

C Reduce: add the two accumulators, spill the four 32-bit partial sums to
C memory just below the stack pointer, and sum them into r8.
	ALIGN(16)
L(rt):	vadduwm	v3, v12, v13
	li	r7, -16			C FIXME: does all ppc32 and ppc64 ABIs
	stvx	v3, r7, r1		C FIXME: ...support storing below sp?

	lwz	r7, -16(r1)
	add	r8, r8, r7
	lwz	r7, -12(r1)
	add	r8, r8, r7
	lwz	r7, -8(r1)
	add	r8, r8, r7
	lwz	r7, -4(r1)
	add	r8, r8, r7

C Handle outer loop for huge n.  We inherit cr7 and r0 from above.
LIMB64(`ble	cr7, L(ret)
	vxor	v12, v12, v12		C zero total count
	vxor	v13, v13, v13		C zero total count
	mr	n, r9
	cmpd	cr7, n, r0
	ble	cr7, L(2)
	addis	r9, n, -LIMBS_PER_CHUNK	C remaining n
	lis	n, LIMBS_PER_CHUNK
L(2):	srdi	r7, n, 2		C loop count corresponding to n
	mtctr	r7			C copy n to count register
	b	L(top)
')
	ALIGN(16)
L(ret):	mr	r3, r8			C return grand total
	mtspr	256, r10		C restore caller VRSAVE
	blr
EPILOGUE()

C Constant table, 16-byte aligned.  Row 0 is the per-nibble popcount table
C consumed by vperm; the remaining rows are byte masks used to trim the
C first and last, possibly partial, vectors of the operand.
DEF_OBJECT(cnsts,16)
C Counts for vperm
	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
C Masks for high end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(cnsts)
ASM_END()