dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/arm64/hamdist.asm

dnl  ARM64 Neon mpn_hamdist -- mpn bit hamming distance.

dnl  Copyright 2013, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C Cortex-A53	 ?
C Cortex-A57	 ?

C TODO
C  * Consider greater unrolling.
C  * Arrange to align the pointer, if that helps performance.  Use the same
C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
C    valgrind!)
C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.

changecom(@&*$)

C INPUT PARAMETERS
define(`ap', x0)
define(`bp', x1)
define(`n',  x2)

C REGISTER USAGE
C  In:	 x0 = ap, x1 = bp (limb pointers), x2 = n (limb count)
C  Out:	 x0 = number of bit positions in which {ap,n} and {bp,n} differ
C  v0-v3	 limbs loaded from ap, then ap XOR bp
C  v16-v19	 limbs loaded from bp
C  v6,v7	 per-byte population counts of the XORed data
C  v4,v5	 16-bit accumulators (8 lanes each)
C  x11		 maxsize, set once at entry and compared against in L(gt8k)
C  x4-x10	 used only by the large-count driver at L(gt8k)
C  Clobbers x0-x2, x4-x11, v0-v7, v16-v19 and the flags.  x30 is saved in x8
C  and restored by the large-count driver, which is the only path using bl.

C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
C up with 8 16-bit counters.  Each of those 8 counters must stay below 2^16.
C Per counter, every 4-limb group adds at most 32, the 2-limb step adds at
C most 16, and the 1-limb step adds at most 16 (to the low four counters only,
C since the single limb occupies the low half of the register).  For n =
C 0x1fff all three apply together with 2047 groups, giving
C 2047*32 + 16 + 16 = 2^16, which wraps to zero.  The largest count that is
C safe in one pass is therefore 0x1ffe limbs (at most 2047*32 + 16 = 65520 per
C counter).  We use a chunksize close to that, but which is a multiple of 8
C and so allows the huge count code to jump deep into the code (at L(chu)).

define(`maxsize',  0x1ffe)
define(`chunksize',0x1ff0)

ASM_START()
PROLOGUE(mpn_hamdist)

	mov	x11, #maxsize
	cmp	n, x11
	b.hi	L(gt8k)				C too many limbs for one pass

C Single-pass code for n <= maxsize.  Also called (bl) from L(gt8k) for the
C final partial chunk.
L(lt8k):
	movi	v4.16b, #0			C clear summation register
	movi	v5.16b, #0			C clear summation register

C Peel off n mod 8 limbs first, one bit of n at a time.
	tbz	n, #0, L(xx0)
	sub	n, n, #1
	ld1	{v0.1d}, [ap], #8		C load 1 limb
	ld1	{v16.1d}, [bp], #8		C load 1 limb
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b			C could also splat

L(xx0):	tbz	n, #1, L(x00)
	sub	n, n, #2
	ld1	{v0.2d}, [ap], #16		C load 2 limbs
	ld1	{v16.2d}, [bp], #16		C load 2 limbs
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b

L(x00):	tbz	n, #2, L(000)
	subs	n, n, #4
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	b.ls	L(sum)				C those 4 limbs were the last

C More than 4 limbs remained: enter the software-pipelined loop at L(mid).
L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	sub	n, n, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	L(mid)

L(000):	subs	n, n, #8
	b.lo	L(e0)				C fewer than 8 limbs left, i.e. none

C Entry point for the large-count driver: n holds (limbs to process) - 8 and
C v4,v5 have been cleared by the caller.
L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	n, n, #8
	b.lo	L(end)

C Main loop, 8 limbs per iteration.  Loads, XORs, byte counts and
C accumulations of neighbouring 4-limb groups are interleaved.
L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v1.16b
L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	subs	n, n, #8
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v3.16b
	b.hs	L(top)

C Drain the pipeline: accumulate the pending byte counts, then handle the
C last loaded but not yet XORed group in v0,v1 / v16,v17.
L(end):	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
L(sum):	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
	add	v4.8h, v4.8h, v5.8h
					C we have 8 16-bit counts
C L(e0) is reached directly only from L(000), where v5 is still zero, so
C skipping the add above loses nothing.
L(e0):	uaddlp	v4.4s,  v4.8h			C we have 4 32-bit counts
	uaddlp	v4.2d,  v4.4s			C we have 2 64-bit counts
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]
	add	x0, x0, x1
	ret

C Code for count > maxsize.  Splits operand and calls above code.
C The called code overwrites x0 and x1 (ap, bp) with the result computation,
C so the pointers to the next chunk are computed before each call.
define(`ap2', x5)			C caller-saves reg not used above
define(`bp2', x6)			C caller-saves reg not used above
L(gt8k):
	mov	x8, x30				C save return address across bl
	mov	x7, n			C full count (caller-saves reg not used above)
	mov	x4, #0			C total sum  (caller-saves reg not used above)
	mov	x9, #chunksize*8	C caller-saves reg not used above
	mov	x10, #chunksize		C caller-saves reg not used above

1:	add	ap2, ap, x9		C point at subsequent block
	add	bp2, bp, x9		C point at subsequent block
	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
	movi	v4.16b, #0		C clear chunk summation register
	movi	v5.16b, #0		C clear chunk summation register
	bl	L(chu)			C jump deep inside code
	add	x4, x4, x0
	mov	ap, ap2			C put chunk pointer in place for calls
	mov	bp, bp2			C put chunk pointer in place for calls
	sub	x7, x7, x10
	cmp	x7, x11			C x11 still holds maxsize
	b.hi	1b

	mov	n, x7			C count for final invocation
	bl	L(lt8k)
	add	x0, x4, x0
	mov	x30, x8
	ret
EPILOGUE()