github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/com.asm (about) 1 dnl Alpha mpn_com -- mpn one's complement. 2 3 dnl Copyright 2003 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C cycles/limb 35 C EV4: 4.75 36 C EV5: 2.0 37 C EV6: 1.5 38 39 40 C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size); 41 C 42 C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total 43 C 2.0 c/l. In general, a pattern like this unrolled to N limbs per loop 44 C will be 1.5+2/N c/l. 45 C 46 C 2 cycles of loop control are unavoidable, for pointer updates and the 47 C taken branch bubble, but also since ldq cannot issue two cycles after stq 48 C (and with a run of stqs that means neither of two cycles at the end of the 49 C loop).
C
C The fbeq is forced into the second cycle of the loop using unops, since
C the first time through it must wait for the cvtqt result.  Once that
C result is ready (a 1 cycle stall) then both the branch and following loads
C can issue together.
C
C The main loop handles an odd count of limbs, being two limbs loaded before
C each size test, plus one pipelined around from the previous iteration (or
C setup in the entry sequence).
C
C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
C entry sequence, and an increment of the pointers.  For an odd size there's
C no increment and the first store in the loop (r24) is a repeat of dst[0].
C
C Note that the load for r24 after the possible pointer increment is done
C before the explicit store to dst[0], in case src==dst.
C
C Loop counter: f0 holds (size-3)/2 as a double and is decreased by 2.0 on
C each pass of the loop (one pass stores 4 limbs).  On every entry to L(top),
C 2*f0+3 limbs remain to be stored from r19 onwards, the first of them
C already complemented in r24.  So f0==0 leaves exactly three limbs, which
C L(done_2) finishes, and f0<0 leaves exactly one, which L(done_1) finishes.


ASM_START()

C 2.0 as a double: the per-pass decrement for the f0 loop counter (loaded
C into f1 in the entry sequence).
FLOAT64(L(dat), 2.0)

	ALIGN(16)

C Interface, as used by the code below:
C   r16  dst   size limbs are written here; r16 itself is not modified
C   r17  src   size limbs are read from here
C   r18  size  limb count.  src[0] is loaded and dst[0] is stored
C              unconditionally, so the code needs size >= 1.
C   No result register is written; the routine returns through r26.
C   Registers written: r5-r8, r17, r19-r24, f0, f1.  16 bytes of stack are
C   borrowed during the entry sequence and released before the loop.
C   NOTE(review): the `gp' argument presumably makes PROLOGUE set up the
C   global pointer needed by LEA below -- the macro is defined in the m4
C   support files, not visible here.
PROLOGUE(mpn_com,gp)

	C r16	dst
	C r17	src
	C r18	size

	C The stack slot exists only to move (size-3)/2 from integer register
	C r6 into floating point register f0 (stq below, then ldt).
	lda	r30, -16(r30)	C temporary stack space
	lda	r7, -3(r18)	C size - 3

	C srl is a logical shift, so r6 is not meaningful when size<3; in that
	C case the blt below leaves for L(done_1), which does not read f0.
	ldq	r20, 0(r17)	C src[0]
	srl	r7, 1, r6	C (size-3)/2

	stq	r6, 8(r30)	C (size-3)/2
	and	r7, 1, r5	C 1 if size even

	LEA(	r8, L(dat))
	s8addq	r5, r17, r17	C skip src[0] if even

	ornot	r31, r20, r20	C ~src[0]
	unop

	C r24 is loaded before dst[0] is stored, so an in-place call (src==dst)
	C with odd size still sees the original src[0].
	ldt	f0, 8(r30)	C (size-3)/2
	ldq	r24, 0(r17)	C src[0 or 1]

	C r19 becomes the running dst pointer: dst for odd size, dst+1 limb
	C for even size.
	stq	r20, 0(r16)	C dst[0]
	s8addq	r5, r16, r19	C skip dst[0] if even

	ldt	f1, 0(r8)	C data 2.0
	lda	r30, 16(r30)	C restore stack
	unop
	cvtqt	f0, f0		C (size-3)/2 as float

	C r24 now holds the complemented limb that belongs at 0(r19).
	C NOTE(review): the two unops after the blt presumably pad the entry
	C sequence so that L(top) lands on the 16-byte boundary noted below.
	ornot	r31, r24, r24
	blt	r7, L(done_1)	C if size<=2
	unop
	unop


	C 16-byte alignment here
L(top):
	C r17	src, incrementing
	C r19	dst, incrementing
	C r24	dst[i] result, ready to store
	C f0	(size-3)/2, decrementing
	C f1	2.0

	ldq	r20, 8(r17)	C src[i+1]
	ldq	r21, 16(r17)	C src[i+2]
	unop
	unop

	C f0==0: only r24 and the two limbs just loaded remain, so leave for
	C L(done_2) before loading any further limbs.
	fbeq	f0, L(done_2)
	unop
	ldq	r22, 24(r17)	C src[i+3]
	ldq	r23, 32(r17)	C src[i+4]

	stq	r24, 0(r19)	C dst[i]
	ornot	r31, r20, r20
	subt	f0, f1, f0	C count -= 2
	unop

	stq	r20, 8(r19)	C dst[i+1]
	ornot	r31, r21, r21
	unop
	unop

	stq	r21, 16(r19)	C dst[i+2]
	ornot	r31, r22, r22

	C ~src[i+4] goes into r24: it is the limb carried round to the next
	C pass, or stored by L(done_1) if the loop ends here.
	stq	r22, 24(r19)	C dst[i+3]
	ornot	r31, r23, r24

	C Falling through the fbge (f0<0) means exactly one limb, the one in
	C r24, is still to be stored.
	lda	r17, 32(r17)	C src += 4
	lda	r19, 32(r19)	C dst += 4
	unop
	fbge	f0, L(top)


	C Reached by falling out of the loop, or directly from the entry
	C sequence when size<=2.
L(done_1):
	C r19	&dst[size-1]
	C r24	result for dst[size-1]

	stq	r24, 0(r19)	C dst[size-1]
	ret	r31, (r26), 1


	C Reached from the fbeq in the loop.  r20 and r21 were loaded at
	C L(top) but not yet complemented, so that is done here.
L(done_2):
	C r19	&dst[size-3]
	C r20	src[size-2]
	C r21	src[size-1]
	C r24	result for dst[size-3]

	stq	r24, 0(r19)	C dst[size-3]
	ornot	r31, r20, r20

	stq	r20, 8(r19)	C dst[size-2]
	ornot	r31, r21, r21

	stq	r21, 16(r19)	C dst[size-1]
	ret	r31, (r26), 1

EPILOGUE()
ASM_END()