dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/s390_32/esame/sqr_basecase.asm

dnl  S/390-32 mpn_sqr_basecase.

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C            cycles/limb
C z900		 ?
C z990		23
C z9		 ?
C z10		 ?
C z196		 ?

C TODO
C  * Clean up.
C  * Stop iterating addmul_1 loop at latest for n = 2, implement longer tail.
C    This will ask for basecase handling of n = 3.
C  * Update counters and pointers more straightforwardly, possibly lowering
C    register usage.
C  * Should we use this allocation-free style for more sqr_basecase asm
C    implementations?  The only disadvantage is that it requires R != U.
C  * Replace loops by faster code.  The mul_1 and addmul_1 loops could be sped
C    up by about 10%.  The sqr_diag_addlsh1 loop could probably be sped up even
C    more.
C mpn_sqr_basecase -- square an n-limb operand.
C
C Writes the 2n-limb square of the n-limb number at up to rp, least
C significant limb first.  Limbs are 32 bits wide, so offsets step by 4 bytes.
C
C Structure:
C   n = 1, n = 2   straight-line code.
C   n >= 3         (1) mul_1:    rp[1..n] = u0 * up[1..n-1]
C                  (2) addmul_1: for each further row i, add
C                      u_i * up[i+1..n-1] into rp starting at rp[2i+1]; the
C                      last row is a single product and is unrolled after
C                      the loop
C                  (3) sqr_diag_addlsh1: double the off-diagonal sum and add
C                      the squares u_i^2, filling rp[0] and rp[2n-1] too.
C
C Notes:
C   * Nothing is allocated.  Partial products are accumulated directly in rp
C     while up is still being read, so rp must not overlap up.
C   * n = 0 has no path of its own; it would run the n = 1 code.
C   * Carries live in the condition code across loop iterations.  Between the
C     add instructions the loops only use l, st, stm, lr, la, mlr and brct,
C     which are relied on to leave the condition code unchanged.
C   * The stack pointer is not adjusted.  Registers are spilled at 24(%r15),
C     presumably the ABI register save area of the current frame -- confirm
C     against the platform ABI.

C INPUT PARAMETERS
define(`rp',	`%r2')
define(`up',	`%r3')
define(`n',	`%r4')

C Register aliases used by the general (n >= 3) path only.
define(`zero',	`%r8')
define(`rp_saved',	`%r9')
define(`up_saved',	`%r13')
define(`n_saved',	`%r14')

ASM_START()
PROLOGUE(mpn_sqr_basecase)
	ahi	n, -2			C n -= 2; the condition code is tested twice
	jhe	L(ge2)			C n - 2 >= 0

C n = 1
	l	%r5, 0(up)
	mlr	%r4, %r5		C r4:r5 = u0 * u0, n is dead from here
	st	%r5, 0(rp)		C low limb
	st	%r4, 4(rp)		C high limb
	br	%r14

L(ge2):	jne	L(gen)			C same ahi result: n - 2 != 0

C n = 2
	stm	%r6, %r8, 24(%r15)	C only r6-r8 are needed on this path
	lhi	zero, 0

	l	%r5, 0(up)
	mlr	%r4, %r5		C u0 * u0
	l	%r1, 4(up)
	mlr	%r0, %r1		C u1 * u1
	st	%r5, 0(rp)

	l	%r7, 0(up)
	ml	%r6, 4(up)		C u0 * u1
	alr	%r7, %r7		C double the cross product ...
	alcr	%r6, %r6
	alcr	%r0, zero		C ... its top bit goes into the high limb

	alr	%r4, %r7		C add the doubled cross product at limb 1
	alcr	%r1, %r6
	alcr	%r0, zero
	st	%r4, 4(rp)
	st	%r1, 8(rp)
	st	%r0, 12(rp)

	lm	%r6, %r8, 24(%r15)
	br	%r14

L(gen):
C mul_1 =======================================================================
C rp[1..n] = u0 * up[1..n-1].  rp[0] is written later by the diagonal pass.

	stm	%r6, %r14, 24(%r15)	C r14 is the return address, reused as n_saved
	lhi	zero, 0
	lr	up_saved, up
	lr	rp_saved, rp
	lr	n_saved, n		C holds n - 2

	l	%r6, 0(up)		C u0, the multiplier for this row
	l	%r11, 4(up)
	lhi	%r12, 8			C init index register
	mlr	%r10, %r6		C r10:r11 = u1 * u0
	lr	%r5, n			C inner count, n - 2
	st	%r11, 4(rp)
	cr	%r15, %r15		C clear carry flag

L(tm):	l	%r1, 0(%r12,up)
	mlr	%r0, %r6
	alcr	%r1, %r10		C add previous high limb and pending carry
	lr	%r10, %r0		C copy high part to carry limb
	st	%r1, 0(%r12,rp)
	la	%r12, 4(%r12)
	brct	%r5, L(tm)

	alcr	%r0, zero		C fold the last carry into the top limb
	st	%r0, 0(%r12,rp)

C addmul_1 loop ===============================================================
C Each pass handles one row.  The row length shrinks by one limb per pass;
C n doubles as the outer counter and as the inner trip count.

	ahi	n, -1			C n - 3 rows go through the loop
	je	L(outer_end)		C n = 3: only the unrolled last row remains
L(outer_loop):

	la	rp, 8(rp)		C rp += 2
	la	up, 4(up)		C up += 1
	l	%r6, 0(up)		C multiplier for this row
	l	%r11, 4(up)
	lhi	%r12, 8			C init index register
	mlr	%r10, %r6		C first product of the row, outside the loop
	lr	%r5, n
	al	%r11, 4(rp)		C leaves the carry for the first alcr below
	st	%r11, 4(rp)

C The carry out of the alr in this loop is not added to r0 before r0 is
C copied to r10.  It stays in the condition code and is consumed by the first
C alcr of the next iteration, or by the alcr after the loop.
L(tam):	l	%r1, 0(%r12,up)
	l	%r7, 0(%r12,rp)
	mlr	%r0, %r6
	alcr	%r1, %r7		C low product + rp limb + pending carry
	alcr	%r0, zero
	alr	%r1, %r10		C add the carry limb of the previous product
	lr	%r10, %r0
	st	%r1, 0(%r12,rp)
	la	%r12, 4(%r12)
	brct	%r5, L(tam)

	alcr	%r0, zero
	st	%r0, 0(%r12,rp)

	brct	n, L(outer_loop)
L(outer_end):

C Last row: the single product up[1] * up[2] relative to the current up, added
C at 12(rp).  r0 still holds the limb most recently stored at 12(rp).
	l	%r6, 4(up)
	l	%r1, 8(up)
	lr	%r7, %r0		C Same as: l %r7, 12(,rp)
	mlr	%r0, %r6
	alr	%r1, %r7
	alcr	%r0, zero
	st	%r1, 12(rp)
	st	%r0, 16(rp)

C sqr_diag_addlsh1 ============================================================
C Restart from the saved pointers.  Each iteration doubles two limbs of the
C off-diagonal sum and adds the high limb of the previous square plus the low
C limb of the current square.

define(`up', `up_saved')
define(`rp', `rp_saved')
	la	n, 1(n_saved)		C n_saved + 1 = original n - 1 iterations

	l	%r1, 0(up)
	mlr	%r0, %r1		C r0:r1 = u0 * u0
	st	%r1, 0(rp)
C	clr	%r15, %r15		C clear carry (already clear per above)

L(top):	l	%r11, 4(up)
	la	up, 4(up)
	l	%r6, 4(rp)
	l	%r7, 8(rp)
	mlr	%r10, %r11		C r10:r11 = square of the current limb
	alcr	%r6, %r6		C double, taking the carry of the previous pass
	alcr	%r7, %r7
	alcr	%r10, zero		C propagate carry to high product limb
	alr	%r6, %r0		C add high limb of the previous square
	alcr	%r7, %r11		C add low limb of this square
	stm	%r6, %r7, 4(rp)
	la	rp, 8(rp)
	lr	%r0, %r10		C copy carry limb
	brct	n, L(top)

	alcr	%r0, zero
	st	%r0, 4(rp)		C most significant limb of the result

	lm	%r6, %r14, 24(%r15)	C also restores the return address in r14
	br	%r14
EPILOGUE()