dnl  Listing origin: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/pa64/sqr_diagonal.asm

dnl  HP-PA 2.0 64-bit mpn_sqr_diagonal.

dnl  Copyright 2001-2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


dnl  This code runs at 7.25 cycles/limb on PA8000 and 7.75 cycles/limb on
dnl  PA8500.  The cache would saturate at 5 cycles/limb, so there is some room
dnl  for optimization.
include(`../config.m4')

C mpn_sqr_diagonal
C
C For each of the n doubleword limbs read from up, store the 128-bit square
C of that limb in the next two doublewords at rp, low doubleword first.
C
C Method.  A limb is loaded into an FP register and viewed as two 32-bit
C halves, l (the %frNl half) and r (the %frNr half).  xmpyu forms the three
C 64-bit products r*r, l*l and l*r.  r*r and l*l are stored directly to the
C low and high result doublewords.  The cross product l*r is passed through
C a stack scratch slot into an integer register and then added into the
C stored pair as (l*r) shifted left by 33 bits, that is 2*l*r*2^32, with the
C carry propagated from the low doubleword into the high one.
C
C Scheduling.  The main loop is unrolled twice and software pipelined.  Each
C half loads limb i+1, multiplies limb i, and completes the integer add for
C limb i-1.  The two halves alternate between %fr8 and %fr4 for the limb and
C between the scratch slots at -120(%r30) and -128(%r30) for the cross
C product.  The tails end1, end2 and exit drain whatever is still pending.
C
C Notes.
C  * The first limb is loaded before n is tested, so n is assumed to be at
C    least 1.
C  * Every addib and bve below is followed by one instruction in its delay
C    slot; none of the branches uses the nullify completer, so that
C    instruction executes whether or not the branch is taken.
C  * Registers written: %r19 %r20 %r28 %r29 %r31, %fr4 to %fr11, and the
C    three parameter registers.  %r30 is raised by 128 on entry and lowered
C    again in the delay slot of each return.

C INPUT PARAMETERS
define(`rp',`%r26')
define(`up',`%r25')
define(`n',`%r24')

C Integer temporaries for the accumulate step
define(`p00',`%r28')
define(`p32',`%r29')
define(`p64',`%r31')
define(`t0',`%r19')
define(`t1',`%r20')

ifdef(`HAVE_ABI_2_0w',
`	.level	2.0w
',`	.level	2.0
')
PROLOGUE(mpn_sqr_diagonal)
	ldo		128(%r30),%r30		C reserve 128 bytes, scratch at -128 and -120

	fldds,ma	8(up),%fr8		C fr8 = first limb, then up += 8
	addib,=		-1,n,L(end1)		C n -= 1, a single limb is finished in end1
	nop					C delay slot
	fldds,ma	8(up),%fr4		C fr4 = second limb
	xmpyu		%fr8l,%fr8r,%fr10	C cross product l*r of first limb
	fstd		%fr10,-120(%r30)	C park it in scratch slot -120
	xmpyu		%fr8r,%fr8r,%fr9	C r*r
	fstd		%fr9,0(rp)		C goes to the low result doubleword
	xmpyu		%fr8l,%fr8l,%fr11	C l*l
	fstd		%fr11,8(rp)		C goes to the high result doubleword
	addib,=		-1,n,L(end2)		C n -= 1, exactly two limbs are finished in end2
	ldo		16(rp),rp		C delay slot: rp += 16

LDEF(loop)
C First half: multiply the limb in fr4, finish the pair whose cross product
C sits in slot -120.
	fldds,ma	8(up),%fr8		C load next up limb
	xmpyu		%fr4l,%fr4r,%fr6
	fstd		%fr6,-128(%r30)
	xmpyu		%fr4r,%fr4r,%fr5	C multiply in fp regs
	fstd		%fr5,0(rp)
	xmpyu		%fr4l,%fr4l,%fr7
	fstd		%fr7,8(rp)
	ldd		-120(%r30),p32
	ldd		-16(rp),p00		C accumulate in int regs
	ldd		-8(rp),p64
	depd,z		p32,30,31,t0		C t0 = low 64 bits of cross product << 33
	add		t0,p00,p00
	std		p00,-16(rp)
	extrd,u		p32,32,33,t1		C t1 = cross product >> 31, the bits shifted out of t0
	add,dc		t1,p64,p64		C add with the carry from the low doubleword
	std		p64,-8(rp)
	addib,=		-1,n,L(exit)		C n -= 1, no limb left to load goes to exit
	ldo		16(rp),rp		C delay slot: rp += 16

C Second half: multiply the limb in fr8, finish the pair whose cross product
C sits in slot -128.
	fldds,ma	8(up),%fr4
	xmpyu		%fr8l,%fr8r,%fr10
	fstd		%fr10,-120(%r30)
	xmpyu		%fr8r,%fr8r,%fr9
	fstd		%fr9,0(rp)
	xmpyu		%fr8l,%fr8l,%fr11
	fstd		%fr11,8(rp)
	ldd		-128(%r30),p32
	ldd		-16(rp),p00
	ldd		-8(rp),p64
	depd,z		p32,30,31,t0
	add		t0,p00,p00
	std		p00,-16(rp)
	extrd,u		p32,32,33,t1
	add,dc		t1,p64,p64
	std		p64,-8(rp)
	addib,<>	-1,n,L(loop)		C n -= 1, continue while limbs remain
	ldo		16(rp),rp		C delay slot: rp += 16

LDEF(end2)
C Pending here: the last limb is in fr4 and not yet multiplied, and the
C previous pair still needs its cross product from slot -120.
	xmpyu		%fr4l,%fr4r,%fr6
	fstd		%fr6,-128(%r30)
	xmpyu		%fr4r,%fr4r,%fr5
	fstd		%fr5,0(rp)
	xmpyu		%fr4l,%fr4l,%fr7
	fstd		%fr7,8(rp)
	ldd		-120(%r30),p32		C finish the previous pair
	ldd		-16(rp),p00
	ldd		-8(rp),p64
	depd,z		p32,30,31,t0
	add		t0,p00,p00
	std		p00,-16(rp)
	extrd,u		p32,32,33,t1
	add,dc		t1,p64,p64
	std		p64,-8(rp)
	ldo		16(rp),rp
	ldd		-128(%r30),p32		C finish the last pair
	ldd		-16(rp),p00
	ldd		-8(rp),p64
	depd,z		p32,30,31,t0
	add		t0,p00,p00
	std		p00,-16(rp)
	extrd,u		p32,32,33,t1
	add,dc		t1,p64,p64
	std		p64,-8(rp)
	bve		(%r2)			C return through %r2
	ldo		-128(%r30),%r30		C delay slot: release the 128 bytes

LDEF(exit)
C Pending here: the last limb is in fr8 and not yet multiplied, and the
C previous pair still needs its cross product from slot -128.
C This tail adds the cross product shifted left by 32 bits twice, instead
C of once shifted by 33 bits as the loop does; the sum is the same.
	xmpyu		%fr8l,%fr8r,%fr10
	fstd		%fr10,-120(%r30)
	xmpyu		%fr8r,%fr8r,%fr9
	fstd		%fr9,0(rp)
	xmpyu		%fr8l,%fr8l,%fr11
	fstd		%fr11,8(rp)
	ldd		-128(%r30),p32		C finish the previous pair
	ldd		-16(rp),p00
	ldd		-8(rp),p64
	depd,z		p32,31,32,t0		C t0 = low 32 bits of cross product << 32
	add		t0,p00,p00
	extrd,u		p32,31,32,t1		C t1 = cross product >> 32
	add,dc		t1,p64,p64
	add		t0,p00,p00		C second addition of the same amount
	add,dc		t1,p64,p64
	std		p00,-16(rp)
	std		p64,-8(rp)
	ldo		16(rp),rp
	ldd		-120(%r30),p32		C finish the last pair
	ldd		-16(rp),p00
	ldd		-8(rp),p64
	depd,z		p32,31,32,t0
	add		t0,p00,p00
	extrd,u		p32,31,32,t1
	add,dc		t1,p64,p64
	add		t0,p00,p00
	add,dc		t1,p64,p64
	std		p00,-16(rp)
	std		p64,-8(rp)
	bve		(%r2)			C return through %r2
	ldo		-128(%r30),%r30		C delay slot: release the 128 bytes

LDEF(end1)
C Single limb case: only fr8 was loaded and nothing else is pending.
	xmpyu		%fr8l,%fr8r,%fr10
	fstd		%fr10,-128(%r30)
	xmpyu		%fr8r,%fr8r,%fr9
	fstd		%fr9,0(rp)
	xmpyu		%fr8l,%fr8l,%fr11
	fstd		%fr11,8(rp)
	ldo		16(rp),rp
	ldd		-128(%r30),p32
	ldd		-16(rp),p00
	ldd		-8(rp),p64
	depd,z		p32,31,32,t0		C t0 = low 32 bits of cross product << 32
	add		t0,p00,p00
	extrd,u		p32,31,32,t1		C t1 = cross product >> 32
	add,dc		t1,p64,p64
	add		t0,p00,p00		C second addition of the same amount
	add,dc		t1,p64,p64
	std		p00,-16(rp)
	std		p64,-8(rp)
	bve		(%r2)			C return through %r2
	ldo		-128(%r30),%r30		C delay slot: release the 128 bytes
EPILOGUE(mpn_sqr_diagonal)