github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/arm/neon/popcount.asm (about) 1 dnl ARM Neon mpn_popcount -- mpn bit population count. 2 3 dnl Copyright 2013 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 C cycles/limb 34 C StrongARM: - 35 C XScale - 36 C Cortex-A7 ? 37 C Cortex-A8 ? 38 C Cortex-A9 1.125 39 C Cortex-A15 0.56 40 41 C TODO 42 C * Explore using vldr and vldm. Does it help on A9? (These loads do 43 C 64-bits-at-a-time, which will mess up in big-endian mode. Except not for 44 C popcount. Except perhaps also for popcount for the edge loads.) 45 C * Arrange to align the pointer, if that helps performance. Use the same 46 C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry 47 C valgrind!) 48 C * Explore if explicit align directives, e.g., "[ptr:128]" help. 49 C * See rth's gmp-devel 2013-02/03 messages about final summation tricks. 

C INPUT PARAMETERS
C   ap   base of the limb vector; advanced by the post-incrementing loads
C   n    number of 32-bit limbs to count
define(`ap', r0)
define(`n', r1)

C RESULT
C   r0   number of set bits in the n limbs at ap
C
C CLOBBERS
C   r1, r2, r3, flags, q0-q3, q8, q9, q12, q13.  r4 is used by the large-count
C   path only, where it is saved on entry and restored on return.

C The per-byte bit counts are accumulated in sixteen 16-bit lanes (q8 and q9),
C which are folded into eight 16-bit lanes before the final widening.  Eight
C such lanes can represent at most 8*(2^16-1) bits, that is
C floor(8*(2^16-1)/32) = 0x3fff limbs.  chunksize is a value close to that
C limit which can still be encoded as an ARM immediate, namely the 8-bit
C constant 0xfe shifted left by 6.
C
define(`chunksize',0x3f80)

ASM_START()
PROLOGUE(mpn_popcount)

	cmp	n, #chunksize
	bhi	L(large)		C too many limbs for a single pass

C Single pass over at most chunksize limbs.  Reached by fall-through from the
C entry point and by bl from L(large).  Returns the count in r0.
L(small):
	vmov.i64   q8, #0		C zero both accumulators
	vmov.i64   q9, #0

C Peel off 1, 2, 4 and 8 limbs according to the low four bits of n, so that
C what remains is a multiple of 16 limbs.
	tst	   n, #1
	beq	   L(bit1)
	vmov.i64   d0, #0		C upper half of d0 must not contribute
	sub	   n, n, #1
	vld1.32   {d0[0]}, [ap]!	C fetch 1 limb
	vcnt.8	   d24, d0
	vpadal.u8  d16, d24		C d16 is still zero, this only widens

L(bit1):
	tst	   n, #2
	beq	   L(bit2)
	sub	   n, n, #2
	vld1.32    {d0}, [ap]!		C fetch 2 limbs
	vcnt.8	   d24, d0
	vpadal.u8  d16, d24

L(bit2):
	tst	   n, #4
	beq	   L(bit3)
	sub	   n, n, #4
	vld1.32    {q0}, [ap]!		C fetch 4 limbs
	vcnt.8	   q12, q0
	vpadal.u8  q8, q12

L(bit3):
	tst	   n, #8
	beq	   L(blk16)

	subs	   n, n, #8
	vld1.32    {q0,q1}, [ap]!	C fetch 8 limbs
	bls	   L(last)		C those were the final 8 limbs

C More limbs follow the odd group of 8: count that group and join the
C pipelined loop at its midpoint.
	vld1.32    {q2,q3}, [ap]!	C fetch 8 limbs
	sub	   n, n, #8
	vcnt.8	   q12, q0
	vcnt.8	   q13, q1
	b	   L(half)

L(blk16):
	subs	   n, n, #16
	blo	   L(fold)		C nothing left; q9 is zero, q8 has it all

	vld1.32    {q2,q3}, [ap]!	C fetch 8 limbs
	vld1.32    {q0,q1}, [ap]!	C fetch 8 limbs
	vcnt.8	   q12, q2
	vcnt.8	   q13, q3
	subs	   n, n, #16
	blo	   L(drain)

C Main loop, 16 limbs per iteration.  Loads, byte counts and accumulation of
C neighbouring groups are interleaved, so the instruction order matters.
L(again):
	vld1.32    {q2,q3}, [ap]!	C fetch 8 limbs
	vpadal.u8  q8, q12
	vcnt.8	   q12, q0
	vpadal.u8  q9, q13
	vcnt.8	   q13, q1
L(half):
	vld1.32    {q0,q1}, [ap]!	C fetch 8 limbs
	subs	   n, n, #16
	vpadal.u8  q8, q12
	vcnt.8	   q12, q2
	vpadal.u8  q9, q13
	vcnt.8	   q13, q3
	bhs	   L(again)

L(drain):
	vpadal.u8  q8, q12		C add the byte counts still pending
	vpadal.u8  q9, q13
L(last):
	vcnt.8	   q12, q0		C count the 8 limbs loaded last
	vcnt.8	   q13, q1
	vpadal.u8  q8, q12
	vpadal.u8  q9, q13
	vadd.i16   q8, q8, q9		C merge into eight 16-bit counts
L(fold):
	vpaddl.u16 q8, q8		C four 32-bit counts
	vpaddl.u32 q8, q8		C two 64-bit counts
	vmov.32    r0, d16[0]		C low word of each half
	vmov.32    r1, d17[0]
	add	   r0, r0, r1
	bx	lr

C Large operands: process chunksize limbs at a time through L(small) and add
C up the partial results.
define(`chunkp', r2)			C not touched by L(small)
define(`nleft', r3)			C not touched by L(small)
define(`acc', r4)			C callee-saved, hence the push
L(large):
	push	{r4,lr}
	mov	chunkp, ap
	mov	nleft, n		C limbs still to be counted
	mov	acc, #0			C running total

L(next):
	mov	n, #chunksize		C size of this pass
	bl	L(small)
	add	chunkp, chunkp, #chunksize*4	C step to the following chunk
	add	acc, acc, r0
	mov	ap, chunkp		C argument for the coming call
	sub	nleft, nleft, #chunksize
	cmp	nleft, #chunksize
	bhi	L(next)

	mov	n, nleft		C size of the last pass
	bl	L(small)
	add	r0, acc, r0
	pop	{r4,pc}
EPILOGUE()