github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/s390_64/sqr_basecase.asm (about) 1 dnl S/390-64 mpn_sqr_basecase. 2 3 dnl Copyright 2011 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 C cycles/limb 34 C z900 ? 35 C z990 23 36 C z9 ? 37 C z10 28 38 C z196 ? 39 40 C TODO 41 C * Clean up. 42 C * Stop iterating addmul_1 loop at latest for n = 2, implement longer tail. 43 C This will ask for basecase handling of n = 3. 44 C * Update counters and pointers more straightforwardly, possibly lowering 45 C register usage. 46 C * Should we use this allocation-free style for more sqr_basecase asm 47 C implementations? The only disadvantage is that it requires R != U. 48 C * Replace loops by faster code. The mul_1 and addmul_1 loops could be sped 49 C up by about 10%. The sqr_diag_addlsh1 loop could probably be sped up even 50 C more. 
C mpn_sqr_basecase
C
C Writes the 2n-limb square of the n-limb operand at up to rp.
C
C Arguments on entry:
C   rp  (%r2)  result area
C   up  (%r3)  source operand
C   n   (%r4)  limb count; any n below 2 takes the single-limb path, so the
C              caller is presumably expected to pass n >= 1
C
C The result area doubles as scratch space for the off-diagonal products, so
C rp must not overlap up.  NOTE(review): this restates the R != U remark in
C the TODO list at the top of the file.
C
C No stack frame is set up.  Callee registers are stored at 48(%r15) without
C moving %r15: %r6..%r8 for n = 2, %r6..%r14 for n >= 3.
C
C General path (N = limb count, B = 2^64):
C   1. mul_1:             rp[1..N] = up[0] * up[1..N-1]
C   2. addmul_1 rows:     for i = 1 .. N-3 add up[i] * up[i+1..N-1] at
C                         rp[2i+1]; a short tail adds up[N-2] * up[N-1]
C   3. sqr_diag_addlsh1:  rp = 2 * (off-diagonal sum) + sum up[i]^2 * B^(2i)
C
C The carry chains rely on lg, stg, stmg, la, lgr, mlgr and brctg leaving the
C condition code alone, so the instruction order below must be kept as is.

C INPUT PARAMETERS
define(`rp',		`%r2')
define(`up',		`%r3')
define(`n',		`%r4')

C WORKING REGISTERS
define(`zero',		`%r8')	C constant 0, used to fold a carry into a limb
define(`rp_diag',	`%r9')	C copy of rp kept for the diagonal pass
define(`up_diag',	`%r13')	C copy of up kept for the diagonal pass
define(`nm2_keep',	`%r14')	C copy of n - 2 kept for the diagonal pass
define(`u_mul',		`%r6')	C multiplier limb of the current row
define(`icnt',		`%r5')	C inner loop counter
define(`idx',		`%r12')	C byte offset applied to both up and rp
define(`p_hi',		`%r0')	C even register of the mlgr pair %r0:%r1
define(`p_lo',		`%r1')	C odd register of the mlgr pair %r0:%r1
define(`q_hi',		`%r10')	C even register of the mlgr pair %r10:%r11
define(`q_lo',		`%r11')	C odd register of the mlgr pair %r10:%r11
define(`acc',		`%r7')	C limb fetched from rp (addmul_1 rows and tail)
define(`d_lo',		`%r6')	C lower of the two limbs doubled per diagonal step
define(`d_hi',		`%r7')	C upper of the two limbs doubled per diagonal step

ASM_START()
PROLOGUE(mpn_sqr_basecase)
	aghi	n, -2			C n -= 2, condition code selects the path
	jhe	L(n_ge_2)		C result >= 0: two limbs or more

C ---------------------------------------------------------------------------
C n = 1: rp[0..1] = up[0]^2
C ---------------------------------------------------------------------------
	lg	%r5, 0(up)
	mlgr	%r4, %r5		C %r4:%r5 = up[0]^2 (overwrites n)
	stg	%r5, 0(rp)		C low limb
	stg	%r4, 8(rp)		C high limb
	br	%r14

L(n_ge_2):
	jne	L(general)		C aghi result was nonzero: n >= 3

C ---------------------------------------------------------------------------
C n = 2: rp[0..3] = up[0]^2 + 2 * up[0] * up[1] * B + up[1]^2 * B^2
C ---------------------------------------------------------------------------
	stmg	%r6, %r8, 48(%r15)	C this path only needs %r6..%r8
	lghi	zero, 0

	lg	%r5, 0(up)
	mlgr	%r4, %r5		C %r4:%r5 = u0 * u0
	lg	%r1, 8(up)
	mlgr	%r0, %r1		C %r0:%r1 = u1 * u1
	stg	%r5, 0(rp)		C rp[0] is final already

	lg	%r7, 0(up)
	mlg	%r6, 8(up)		C %r6:%r7 = u0 * u1
	algr	%r7, %r7		C double the cross product ...
	alcgr	%r6, %r6
	alcgr	%r0, zero		C ... its top bit lands in the highest limb

	algr	%r4, %r7		C add doubled cross product at limb 1
	alcgr	%r1, %r6
	alcgr	%r0, zero
	stg	%r4, 8(rp)
	stg	%r1, 16(rp)
	stg	%r0, 24(rp)

	lmg	%r6, %r8, 48(%r15)
	br	%r14

C ---------------------------------------------------------------------------
C n >= 3.  From here on the n register holds N - 2.
C ---------------------------------------------------------------------------
L(general):

C mul_1: rp[1..N] = up[0] * up[1..N-1] =====================================

	stmg	%r6, %r14, 48(%r15)
	lghi	zero, 0
	lgr	up_diag, up		C keep the entry values for the last pass
	lgr	rp_diag, rp
	lgr	nm2_keep, n

	lg	u_mul, 0(up)		C u_mul = up[0]
	lg	q_lo, 8(up)		C q_lo = up[1]
	lghi	idx, 16			C loop starts at limb 2
	mlgr	q_hi, u_mul		C q_hi:q_lo = up[0] * up[1]
	lgr	icnt, n			C N - 2 limbs left in this row
	stg	q_lo, 8(rp)
	cr	%r15, %r15		C self-compare: leaves the carry clear

L(mul1_top):
	lg	p_lo, 0(idx,up)
	mlgr	p_hi, u_mul		C p_hi:p_lo = up[0] * up[idx/8]
	alcgr	p_lo, q_hi		C add previous high limb and carry
	lgr	q_hi, p_hi		C this high limb feeds the next round
	stg	p_lo, 0(idx,rp)
	la	idx, 8(idx)
	brctg	icnt, L(mul1_top)

	alcgr	p_hi, zero		C fold the last carry into the top limb
	stg	p_hi, 0(idx,rp)		C rp[N]

C addmul_1 rows =============================================================

	aghi	n, -1			C n = N - 3 = number of full rows
	je	L(outer_done)		C N = 3: only the tail product remains
L(outer_top):

	la	rp, 16(rp)		C rp += 2 limbs
	la	up, 8(up)		C up += 1 limb
	lg	u_mul, 0(up)		C multiplier of this row
	lg	q_lo, 8(up)
	lghi	idx, 16			C loop starts at limb 2 of the row
	mlgr	q_hi, u_mul		C first product of the row
	lgr	icnt, n			C remaining products in this row
	alg	q_lo, 8(rp)		C add into rp; carry goes to the loop
	stg	q_lo, 8(rp)

L(addmul1_top):
	lg	p_lo, 0(idx,up)
	lg	acc, 0(idx,rp)
	mlgr	p_hi, u_mul		C p_hi:p_lo = u_mul * up[idx/8]
	alcgr	p_lo, acc		C add rp limb plus the pending carry
	alcgr	p_hi, zero
	algr	p_lo, q_hi		C add high limb of the previous product
	lgr	q_hi, p_hi
	stg	p_lo, 0(idx,rp)
	la	idx, 8(idx)
	brctg	icnt, L(addmul1_top)

	alcgr	p_hi, zero		C fold the last carry into the top limb
	stg	p_hi, 0(idx,rp)		C limb above the row, stored, not added

	brctg	n, L(outer_top)		C each row is one product shorter
L(outer_done):

C Tail: add up[N-2] * up[N-1], addressed relative to the current up and rp.
	lg	u_mul, 8(up)
	lg	p_lo, 16(up)
	lgr	acc, p_hi		C same value as the limb stored at 24(rp)
	mlgr	p_hi, u_mul
	algr	p_lo, acc
	alcgr	p_hi, zero		C leaves the carry clear for the next pass
	stg	p_lo, 24(rp)
	stg	p_hi, 32(rp)

C sqr_diag_addlsh1 ==========================================================
C Uses the saved entry pointers.  Each round doubles two result limbs and
C adds the high limb of the previous square and the low limb of the new one.

	la	n, 1(nm2_keep)		C n = N - 1 rounds

	lg	p_lo, 0(up_diag)
	mlgr	p_hi, p_lo		C p_hi:p_lo = up[0]^2
	stg	p_lo, 0(rp_diag)	C rp[0] is final
C The carry is already clear here, so no explicit clearing is needed.

L(diag_top):
	lg	q_lo, 8(up_diag)	C next source limb
	la	up_diag, 8(up_diag)
	lg	d_lo, 8(rp_diag)
	lg	d_hi, 16(rp_diag)
	mlgr	q_hi, q_lo		C q_hi:q_lo = square of that limb
	alcgr	d_lo, d_lo		C double, taking the carry left by the
	alcgr	d_hi, d_hi		C   previous round
	alcgr	q_hi, zero		C doubling carry goes to the high limb
	algr	d_lo, p_hi		C add high limb of the previous square
	alcgr	d_hi, q_lo		C add low limb of this square
	stmg	d_lo, d_hi, 8(rp_diag)
	la	rp_diag, 16(rp_diag)
	lgr	p_hi, q_hi		C hand the high limb to the next round
	brctg	n, L(diag_top)

	alcgr	p_hi, zero		C final carry into the top limb
	stg	p_hi, 8(rp_diag)	C rp[2N-1]

	lmg	%r6, %r14, 48(%r15)
	br	%r14
EPILOGUE()