dnl  ARM Neon mpn_sec_tabselect.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C StrongARM	 -
C XScale	 -
C Cortex-A7	 ?
C Cortex-A8	 ?
C Cortex-A9	 1.15
C Cortex-A15	 0.65

C mpn_sec_tabselect (rp, tp, n, nents, which)
C
C The table at tp holds nents entries of n limbs each; consecutive entries are
C n*4 bytes apart.  The entry whose number equals which is copied to rp.
C
C Every entry is loaded.  For each entry a compare produces an all-ones or
C all-zeros mask, and vbit uses that mask to decide whether the limbs just
C loaded replace the accumulated result.  No branch and no address in this
C code depends on which.
C
C The limb columns are processed in slices: 8 limbs at a time while at least
C 8 remain, then one slice each of 4, 2 and 1 limbs according to bits 2, 1
C and 0 of n.
C
C The entry loops decrement their counter before testing it, so nents is
C expected to be at least 1.

C Arguments.  The fifth argument, which, is passed on the stack.
define(`rp',      `r0')
define(`tp',      `r1')
define(`n',       `r2')
define(`nents',   `r3')

C Integer scratch registers.
define(`ecnt',    `r4')		C entries left in the current slice
define(`lcnt',    `r5')		C limbs left for the 8-limb loop, less 8
define(`tp_base', `r12')	C value of tp at the start of the current slice

C NEON registers.  Each d name is the low half of the q name above it and is
C used by the 2-limb and 1-limb slices.
define(`selq',    `q10')	C select mask
define(`seld',    `d20')
define(`idxq',    `q13')	C number of the current entry, in every lane
define(`idxd',    `d26')
define(`oneq',    `q14')	C the constant 1, in every lane
define(`oned',    `d28')
define(`whichq',  `q15')	C which, in every lane (d30 and d31)
define(`whichd',  `d30')

ASM_START()
PROLOGUE(mpn_sec_tabselect)
	push	{r4, r5}

	add	r4, sp, #8		C stack argument sits above the 2 saved regs
	vld1.32	{d30[], d31[]}, [r4]	C whichq = 4 copies of which
	vmov.i32 oneq, #1

	subs	lcnt, n, #8
	bmi	L(tail)			C fewer than 8 limbs per entry

C ---- slices of 8 limbs ----
L(slice8):
	mov	ecnt, nents
	mov	tp_base, tp
	veor	idxq, idxq, idxq	C entry number starts at 0
	veor	q2, q2, q2		C clear the result accumulators
	veor	q3, q3, q3
	ALIGN(16)
L(ent8):
	vceq.i32 selq, idxq, whichq	C lanes all ones when entry number == which
	vld1.32	{q0,q1}, [tp]
	vadd.i32 idxq, idxq, oneq
	vbit	q2, q0, selq		C keep the loaded limbs where mask is set
	vbit	q3, q1, selq
	add	tp, tp, n, lsl #2	C same slice, next entry
	subs	ecnt, ecnt, #1
	bne	L(ent8)
	vst1.32	{q2,q3}, [rp]!
	add	tp, tp_base, #32	C first entry, next slice
	subs	lcnt, lcnt, #8
	bpl	L(slice8)

L(tail):
	tst	n, #4
	beq	L(chk2)
C ---- one slice of 4 limbs ----
	mov	ecnt, nents
	mov	tp_base, tp
	veor	idxq, idxq, idxq
	veor	q2, q2, q2
	ALIGN(16)
L(ent4):
	vceq.i32 selq, idxq, whichq
	vld1.32	{q0}, [tp]
	vadd.i32 idxq, idxq, oneq
	vbit	q2, q0, selq
	add	tp, tp, n, lsl #2
	subs	ecnt, ecnt, #1
	bne	L(ent4)
	vst1.32	{q2}, [rp]!
	add	tp, tp_base, #16

L(chk2):
	tst	n, #2
	beq	L(chk1)
C ---- one slice of 2 limbs ----
	mov	ecnt, nents
	mov	tp_base, tp
	veor	idxd, idxd, idxd
	veor	d4, d4, d4
	ALIGN(16)
L(ent2):
	vceq.i32 seld, idxd, whichd
	vld1.32	{d0}, [tp]
	vadd.i32 idxd, idxd, oned
	vbit	d4, d0, seld
	add	tp, tp, n, lsl #2
	subs	ecnt, ecnt, #1
	bne	L(ent2)
	vst1.32	{d4}, [rp]!
	add	tp, tp_base, #8

L(chk1):
	tst	n, #1
	beq	L(done)
C ---- final single limb ----
	mov	ecnt, nents
	mov	tp_base, tp
	veor	idxd, idxd, idxd
	veor	d4, d4, d4
	ALIGN(16)
L(ent1):
	vceq.i32 seld, idxd, whichd
	vld1.32	{d0[0]}, [tp]		C only lane 0 is loaded
	vadd.i32 idxd, idxd, oned
	vbit	d4, d0, seld
	add	tp, tp, n, lsl #2
	subs	ecnt, ecnt, #1
	bne	L(ent1)
	vst1.32	{d4[0]}, [rp]		C only lane 0 is stored

L(done):
	pop	{r4, r5}
	bx	r14
EPILOGUE()