dnl  Listing taken from:
dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/arm64/popcount.asm

dnl  ARM64 Neon mpn_popcount -- mpn bit population count.

dnl  Copyright 2013, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C Cortex-A53	 ?
C Cortex-A57	 ?

C TODO
C  * Consider greater unrolling.
C  * Arrange to align the pointer, if that helps performance.  Use the same
C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
C    valgrind!)
C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.

changecom(@&*$)

C INPUT PARAMETERS
define(`ap', x0)
define(`n',  x1)

C INTERFACE
C  In:       ap (x0) = pointer to the limbs, n (x1) = number of 64-bit limbs.
C  Out:      x0 = number of set bits in the n limbs starting at ap.
C  Clobbers: x1, x11, v0-v7, flags.  The path for n > maxsize additionally
C            uses x4, x5, x7, x8, x9, x10, and keeps the return address in x8
C            while it uses bl on the code below (x30 is restored before ret).

C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
C up with 8 16-bit counters.  Every 128-bit vector that is loaded adds at most
C 16 to each of those 8 final counters, and an operand of n limbs is consumed
C as ceil(n/2) vectors (an odd limb is loaded alone into the low half of a
C vector, so it still feeds the 4 low counters with up to 16 each).  The final
C counters therefore stay below 2^16 only while 16*ceil(n/2) <= 2^16-1, i.e.,
C for ceil(n/2) <= 4095 or n <= 0x1ffe limbs.  With n = 0x1fff and all bits
C set, the 4 low counters would reach exactly 2^16 and wrap to 0.  We use a
C chunksize close to that limit, but which allows the huge count code to jump
C deep into the code (at L(chu)).

define(`maxsize',  0x1ffe)
define(`chunksize',0x1ff0)

ASM_START()
PROLOGUE(mpn_popcount)

	mov	x11, #maxsize
	cmp	n, x11
	b.hi	L(gt8k)

C Code for count <= maxsize.  Also called (via bl) from the huge count code.
L(lt8k):
	movi	v4.16b, #0			C clear summation register
	movi	v5.16b, #0			C clear summation register

C Peel off n mod 8 limbs: 1 limb if bit 0 is set, 2 limbs if bit 1 is set,
C 4 limbs if bit 2 is set.
	tbz	n, #0, L(xx0)
	sub	n, n, #1
	ld1	{v0.1d}, [ap], #8		C load 1 limb
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b			C could also splat

L(xx0):	tbz	n, #1, L(x00)
	sub	n, n, #2
	ld1	{v0.2d}, [ap], #16		C load 2 limbs
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b

L(x00):	tbz	n, #2, L(000)
	subs	n, n, #4
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	b.ls	L(sum)				C those were the last 4 limbs

L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	sub	n, n, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	L(mid)

L(000):	subs	n, n, #8
	b.lo	L(e0)				C nothing left, just reduce v4

C Entry point for the huge count code: expects v4,v5 cleared and
C n = (limbs to process) - 8, with the limb count a multiple of 8.
L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	n, n, #8
	b.lo	L(end)

C Main loop, 8 limbs per iteration.  Byte counts in v6,v7 are accumulated
C into v4,v5 while the next vectors are being loaded and counted.
L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v1.16b
L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	subs	n, n, #8
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v3.16b
	b.hs	L(top)

L(end):	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
L(sum):	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
	add	v4.8h, v4.8h, v5.8h		C 16-bit wrapping add, see maxsize
					C we have 8 16-bit counts
L(e0):	uaddlp	v4.4s,  v4.8h			C we have 4 32-bit counts
	uaddlp	v4.2d,  v4.4s			C we have 2 64-bit counts
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]
	add	x0, x0, x1
	ret

C Code for count > maxsize.  Splits operand and calls above code.
define(`ap2', x5)			C caller-saves reg not used above
L(gt8k):
	mov	x8, x30			C keep return address, bl below overwrites x30
	mov	x7, n			C full count (caller-saves reg not used above)
	mov	x4, #0			C total sum  (caller-saves reg not used above)
	mov	x9, #chunksize*8	C caller-saves reg not used above
	mov	x10, #chunksize		C caller-saves reg not used above

1:	add	ap2, ap, x9		C point at subsequent block
	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
	movi	v4.16b, #0		C clear chunk summation register
	movi	v5.16b, #0		C clear chunk summation register
	bl	L(chu)			C jump deep inside code
	add	x4, x4, x0
	mov	ap, ap2			C put chunk pointer in place for calls
	sub	x7, x7, x10
	cmp	x7, x11			C x11 still holds maxsize
	b.hi	1b

	mov	n, x7			C count for final invocation
	bl	L(lt8k)
	add	x0, x4, x0
	mov	x30, x8
	ret
EPILOGUE()