dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/arm/neon/hamdist.asm
dnl  ARM Neon mpn_hamdist -- mpn bit hamming distance.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	      cycles/limb
C StrongARM:	 -
C XScale	 -
C Cortex-A7	 ?
C Cortex-A8	 ?
C Cortex-A9	 1.89
C Cortex-A15	 0.95

C TODO
C  * Explore using vldr and vldm.  Does it help on A9?  (These loads do
C    64-bits-at-a-time, which will mess up in big-endian mode.  Except not for
C    popcount.  Except perhaps also for popcount for the edge loads.)
C  * Arrange to align the pointer, if that helps performance.  Use the same
C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
C    valgrind!)
C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
C mp_limb_t mpn_hamdist (mp_srcptr ap, mp_srcptr bp, mp_size_t n)
C
C Return (in r0) the bit hamming distance of {ap,n} and {bp,n}: each pair of
C limbs is XORed (veor), the set bits of the result are counted per byte
C (vcnt.8), and those byte counts are accumulated (vpadal) into the 16-bit
C lane counters kept in q8/q9.
C
C Registers: r0-r2 are the arguments; the large-count path also uses r3 and
C r4-r6/lr (saved and restored).  NEON q0-q3 and q8-q15 are used as scratch.
C NOTE(review): assumes n > 0, per the usual mpn entry-point convention --
C confirm against callers.

C INPUT PARAMETERS
define(`ap', r0)
define(`bp', r1)
define(`n', r2)

C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
C up with 8 16-bit counters.  Therefore, we can sum to 8(2^16-1) bits, or
C (8*2^16-1)/32 = 0x3fff limbs.  We use a chunksize close to that, but which
C can be represented as a 8-bit ARM constant.
C
define(`chunksize',0x3f80)

ASM_START()
PROLOGUE(mpn_hamdist)

	cmp	n, #chunksize
	bhi	L(gt16k)		C counters could overflow; go chunk

C Small-count entry, n <= chunksize.  Also used as a subroutine (via bl) by
C the large-count code below.
L(lt16k):
	vmov.i64	q8, #0		C clear summation register
	vmov.i64	q9, #0		C clear summation register

C Peel off residues of 1, 2, and 4 limbs (low bits of n) so the remaining
C count is a multiple of 8.  Label names show which low bits of n are known
C to be zero.
	tst	n, #1
	beq	L(xxx0)
	vmov.i64	d0, #0		C zero pad the 64-bit lanes,
	vmov.i64	d20, #0		C only 32 bits get loaded
	sub	n, n, #1
	vld1.32	{d0[0]}, [ap]!		C load 1 limb
	vld1.32	{d20[0]}, [bp]!		C load 1 limb
	veor	d0, d0, d20
	vcnt.8	d24, d0
	vpadal.u8	d16, d24	C d16/q8 = 0; could just splat

L(xxx0):tst	n, #2
	beq	L(xx00)
	sub	n, n, #2
	vld1.32	{d0}, [ap]!		C load 2 limbs
	vld1.32	{d20}, [bp]!		C load 2 limbs
	veor	d0, d0, d20
	vcnt.8	d24, d0
	vpadal.u8	d16, d24

L(xx00):tst	n, #4
	beq	L(x000)
	sub	n, n, #4
	vld1.32	{q0}, [ap]!		C load 4 limbs
	vld1.32	{q10}, [bp]!		C load 4 limbs
	veor	q0, q0, q10
	vcnt.8	q12, q0
	vpadal.u8	q8, q12

L(x000):tst	n, #8
	beq	L(0000)

	subs	n, n, #8
	vld1.32	{q0,q1}, [ap]!		C load 8 limbs
	vld1.32	{q10,q11}, [bp]!	C load 8 limbs
	bls	L(sum)			C n was exactly 8: count and finish

C An odd 8-limb group remains beyond the loaded one; prime the pipeline with
C it and enter the main loop at its midpoint.
L(gt8):	vld1.32	{q2,q3}, [ap]!		C load 8 limbs
	vld1.32	{q14,q15}, [bp]!	C load 8 limbs
	veor	q0, q0, q10
	veor	q1, q1, q11
	sub	n, n, #8
	vcnt.8	q12, q0
	vcnt.8	q13, q1
	b	L(mid)

L(0000):subs	n, n, #16
	blo	L(e0)			C nothing left at all

	vld1.32	{q2,q3}, [ap]!		C load 8 limbs
	vld1.32	{q0,q1}, [ap]!		C load 8 limbs
	vld1.32	{q14,q15}, [bp]!	C load 8 limbs
	vld1.32	{q10,q11}, [bp]!	C load 8 limbs
	veor	q2, q2, q14
	veor	q3, q3, q15
	vcnt.8	q12, q2
	vcnt.8	q13, q3
	subs	n, n, #16
	blo	L(end)

C Main loop: 16 limbs (two 8-limb groups) per iteration, software pipelined.
C While one group is being loaded and XORed, the vcnt results of the previous
C group are folded into the q8/q9 accumulators, hiding load and vcnt latency.
L(top):	vld1.32	{q2,q3}, [ap]!		C load 8 limbs
	vld1.32	{q14,q15}, [bp]!	C load 8 limbs
	veor	q0, q0, q10
	veor	q1, q1, q11
	vpadal.u8	q8, q12
	vcnt.8	q12, q0
	vpadal.u8	q9, q13
	vcnt.8	q13, q1
L(mid):	vld1.32	{q0,q1}, [ap]!		C load 8 limbs
	vld1.32	{q10,q11}, [bp]!	C load 8 limbs
	veor	q2, q2, q14
	veor	q3, q3, q15
	subs	n, n, #16
	vpadal.u8	q8, q12
	vcnt.8	q12, q2
	vpadal.u8	q9, q13
	vcnt.8	q13, q3
	bhs	L(top)

C Drain the pipeline: q12/q13 hold uncounted byte sums, q0,q1/q10,q11 hold
C one still-unprocessed 8-limb group.
L(end):	vpadal.u8	q8, q12
	vpadal.u8	q9, q13
L(sum):	veor	q0, q0, q10
	veor	q1, q1, q11
	vcnt.8	q12, q0
	vcnt.8	q13, q1
	vpadal.u8	q8, q12
	vpadal.u8	q9, q13
	vadd.i16	q8, q8, q9	C we have 8 16-bit counts
C Horizontal reduction of the counters down to the scalar result.
L(e0):	vpaddl.u16	q8, q8		C we have 4 32-bit counts
	vpaddl.u32	q8, q8		C we have 2 64-bit counts
	vmov.32	r0, d16[0]		C totals fit in 32 bits (n <= 0x3fff)
	vmov.32	r1, d17[0]
	add	r0, r0, r1
	bx	lr

C Code for large count.  Splits operand and calls above code.
define(`ap2', r5)
define(`bp2', r6)
L(gt16k):
	push	{r4,r5,r6,r14}
	mov	ap2, ap
	mov	bp2, bp
	mov	r3, n			C full count
	mov	r4, #0			C total sum

1:	mov	n, #chunksize		C count for this invocation
	bl	L(lt16k)		C could jump deep inside code
	add	ap2, ap2, #chunksize*4	C point at next chunk
	add	bp2, bp2, #chunksize*4	C point at next chunk
	add	r4, r4, r0		C accumulate this chunk's count
	mov	ap, ap2			C put chunk pointer in place for call
	mov	bp, bp2			C put chunk pointer in place for call
	sub	r3, r3, #chunksize
	cmp	r3, #chunksize
	bhi	1b

	mov	n, r3			C count for final invocation
	bl	L(lt16k)
	add	r0, r4, r0
	pop	{r4,r5,r6,pc}
EPILOGUE()