dnl  SPARC v9 32-bit mpn_sqr_diagonal.

dnl  Copyright 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')

C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2

C FUNCTION
C   mpn_sqr_diagonal (rp, up, n)
C
C   For each of the n 32-bit limbs at up, the 64-bit square of the limb is
C   written to rp as two 32-bit words, low word at the lower address.  rp
C   advances by 8 bytes and up by 4 bytes per limb.  The count is decremented
C   before it is first tested, so n = 0 is not handled; n must be at least 1.
C   The final restore writes 0 to the o0 register of the caller.
C
C METHOD
C   The limb u is split in the integer unit into hi16 and lo16, passed through
C   the stack to the floating-point unit, and multiplied there as doubles:
C   p16 = u * hi16 and p0 = u * lo16.  Both factors of each product are
C   integers, u below 2^32 and the half below 2^16, so each product is below
C   2^48 and is represented exactly in a double.  The products go back through
C   the stack and are combined as (p16 << 16) + p0.
C
C REGISTER AND STACK USAGE
C   %i0			rp, advanced by 8 for every result stored
C   %i1			up, biased by -8 on entry, advanced by 4 per limb
C   %i2			limbs still to be fetched
C   %g5			0xffff0000, used with andn to clear bits 16-31
C   %g4, %g3, %g2	limb being split into its two 16-bit halves
C   %g2, %g1, %l0	p16, p0 and the high word of the combined result
C   %f8			zero word loaded from .Lnoll; together with the limb
C			in %f9 it forms the 64-bit integer operand of fxtod
C   %f0, %f1		hi16 and lo16, reloaded from the stack
C   %f2			the limb converted to double
C   %f4, %f6		the two partial products
C   %f10, %f12		touched only by the fanop filler
C   [%fp+80], [%fp+72]	alternating 8-byte slots for the split limb
C   [%fp-24], [%fp-16]	p16 and p0 of one limb
C   [%fp-40], [%fp-32]	p16 and p0 of the other limb in flight
C   %l3, %l4, %l5, %l6	tail only: pointers to the four slots above, set up
C			by each exit path to match its pipeline phase
C
C CONTROL FLOW
C   The entry code peels the cases n = 1, 2, 3, 4 and branches to .L1, .L2,
C   .L3, .L4; label .Lk is entered with exactly k results still to be stored.
C   For n >= 5 the pipeline is filled and either .L5 is taken (n = 5) or the
C   loop is entered.  The loop handles two limbs per iteration, alternating
C   the stack slots; it leaves through .Lend after its first half or falls
C   into .L5 after its second half, and .Ltail then drains five results.

C This code uses a very deep software pipeline, due to the need for moving data
C forth and back between the integer registers and floating-point registers.
C
C A VIS variant of this code would make the pipeline less deep, since the
C masking now done in the integer unit could take place in the floating-point
C unit using the FAND instruction.  It would be possible to save several cycles
C too.
C
C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
C not much slower from the Ecache.  It would perhaps be possible to shave off
C one cycle, but not easily.  We cannot do better than 10 cycles/limb with the
C used instructions, since we have 10 memory operations per limb.  But a VIS
C variant could run three cycles faster than the corresponding non-VIS code.

C This is non-pipelined code showing the algorithm:
C
C .Loop:
C	lduw	[up+0],%g4		C 00000000hhhhllll
C	sllx	%g4,16,%g3		C 0000hhhhllll0000
C	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
C	andn	%g2,%g5,%g2		C 0000hhhh0000llll
C	stx	%g2,[%fp+80]
C	ldd	[%fp+80],%f0
C	fitod	%f0,%f4			C hi16
C	fitod	%f1,%f6			C lo16
C	ld	[up+0],%f9
C	fxtod	%f8,%f2
C	fmuld	%f2,%f4,%f4
C	fmuld	%f2,%f6,%f6
C	fdtox	%f4,%f4
C	fdtox	%f6,%f6
C	std	%f4,[%fp-24]
C	std	%f6,[%fp-16]
C	ldx	[%fp-24],%g2
C	ldx	[%fp-16],%g1
C	sllx	%g2,16,%g2
C	add	%g2,%g1,%g1
C	stw	%g1,[rp+0]
C	srlx	%g1,32,%l0
C	stw	%l0,[rp+4]
C	add	up,4,up
C	subcc	n,1,n
C	bne,pt	%icc,.Loop
C	add	rp,8,rp

define(`fanop',`fitod %f12,%f10')	dnl  A quasi nop running in the FA pipe

ASM_START()

	TEXT
	ALIGN(4)
C Zero word, loaded once into %f8 as the upper half of the fxtod operand.
.Lnoll:
	.word	0

PROLOGUE(mpn_sqr_diagonal)
	save	%sp,-256,%sp

C Load the zero word into %f8, PC-relative when building PIC.
ifdef(`PIC',
`.Lpc:	rd	%pc,%o7
	ld	[%o7+.Lnoll-.Lpc],%f8',
`	sethi	%hi(.Lnoll),%g1
	ld	[%g1+%lo(.Lnoll)],%f8')

	sethi	%hi(0xffff0000),%g5
	add	%i1,-8,%i1

C Fetch and split limb 0.  The andn sits in the delay slot of a branch
C without the annul bit, so it executes on both paths.
	lduw	[%i1+8],%g4
	add	%i1,4,%i1		C s1_ptr++
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	bne,pt	%icc,.L_grt_1
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

C n = 1: compute both products of the only limb, then store through .L1.
	add	%i1,4,%i1		C s1_ptr++
	stx	%g2,[%fp+80]
	ld	[%i1],%f9
	ldd	[%fp+80],%f0
	fxtod	%f8,%f2
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	fmuld	%f2,%f6,%f6
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	std	%f6,[%fp-16]

	add	%fp, 80, %l3
	add	%fp, -24, %l4
	add	%fp, 72, %l5
	b	.L1
	add	%fp, -40, %l6

C n >= 2: park limb 0 and split limb 1.
.L_grt_1:
	stx	%g2,[%fp+80]
	lduw	[%i1+8],%g4
	add	%i1,4,%i1		C s1_ptr++
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	bne,pt	%icc,.L_grt_2
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

C n = 2: products of limb 0 go to [%fp-24]; limb 1 is finished from .L2.
	stx	%g2,[%fp+72]
	ld	[%i1],%f9
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fxtod	%f8,%f2
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	fmuld	%f2,%f6,%f6
	fdtox	%f4,%f4

	add	%fp, 72, %l3
	add	%fp, -40, %l4
	add	%fp, 80, %l5
	b	.L2
	add	%fp, -24, %l6

C n >= 3: park limb 1, start converting limb 0, split limb 2.
.L_grt_2:
	stx	%g2,[%fp+72]
	lduw	[%i1+8],%g4
	ld	[%i1],%f9
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	fxtod	%f8,%f2
	bne,pt	%icc,.L_grt_3
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

C n = 3: finish through .L3.
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	add	%fp, 80, %l3
	fmuld	%f2,%f6,%f6
	add	%fp, -24, %l4
	ldd	[%fp+80],%f0
	add	%fp, 72, %l5
	fdtox	%f4,%f4
	b	.L3
	add	%fp, -40, %l6

C n >= 4: products of limb 0 go to [%fp-24] while limb 3 is split.
.L_grt_3:
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	fdtox	%f6,%f6
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	bne,pt	%icc,.L_grt_4
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

C n = 4: finish through .L4.
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
	fitod	%f1,%f6
	add	%fp, 72, %l3
	fmuld	%f2,%f4,%f4
	add	%fp, -40, %l4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	add	%fp, 80, %l5
	fdtox	%f4,%f4
	b	.L4
	add	%fp, -24, %l6

C n >= 5: products of limb 1 go to [%fp-40] while limb 4 is split.  If that
C was the last limb go to .L5, otherwise enter the loop.
.L_grt_4:
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fdtox	%f4,%f4
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	fdtox	%f6,%f6
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-40]
	fxtod	%f8,%f2
	std	%f6,[%fp-32]
	be,pn	%icc,.L5
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	b,a	.Loop			C annulled: the delay slot is not executed

	.align	16
C Each section between C --- markers holds four instructions (presumably one
C issue group; confirm against the UltraSPARC documentation).  The first half
C uses [%fp+80] and [%fp-24]; the second half uses [%fp+72] and [%fp-40].
C --- LOOP BEGIN
.Loop:	nop
	nop
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
C ---
	nop
	nop
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
C ---
	nop
	nop
	ldx	[%fp-24],%g2		C p16
	fanop
C ---
	nop
	nop
	ldx	[%fp-16],%g1		C p0
	fmuld	%f2,%f4,%f4
C ---
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
C ---
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fanop
C ---
	srlx	%g1,32,%l0
	nop
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
C ---
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	nop
	stw	%l0,[%i0-4]
	fdtox	%f6,%f6
C ---
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
C ---
	std	%f6,[%fp-16]
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	be,pn	%icc,.Lend
	fanop
C --- LOOP MIDDLE
	nop
	nop
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
C ---
	nop
	nop
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
C ---
	nop
	nop
	ldx	[%fp-40],%g2		C p16
	fanop
C ---
	nop
	nop
	ldx	[%fp-32],%g1		C p0
	fmuld	%f2,%f4,%f4
C ---
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
C ---
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fanop
C ---
	srlx	%g1,32,%l0
	nop
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
C ---
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	nop
	stw	%l0,[%i0-4]
	fdtox	%f6,%f6
C ---
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-40]
	fxtod	%f8,%f2
C ---
	std	%f6,[%fp-32]
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	bne,pt	%icc,.Loop
	fanop
C --- LOOP END

C Exit after the second loop half, or straight from .L_grt_4 when n = 5.
.L5:	add	%fp, 80, %l3
	add	%fp, -24, %l4
	add	%fp, 72, %l5
	b	.Ltail
	add	%fp, -40, %l6

C Exit after the first loop half: same tail with the slot roles swapped.
.Lend:	add	%fp, 72, %l3
	add	%fp, -40, %l4
	add	%fp, 80, %l5
	add	%fp, -24, %l6
C Drain the pipeline.  Five results are still to be stored from here, four
C from .L4, three from .L3, two from .L2 and one from .L1.
.Ltail:	stx	%g2,[%l3]
	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%l5],%f0
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]
.L4:	fdtox	%f6,%f6
	std	%f4,[%l4]
	fxtod	%f8,%f2
	std	%f6,[%l4+8]

	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l6],%g2		C p16
	ldx	[%l6+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	ldd	[%l3],%f0
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]
.L3:	fdtox	%f6,%f6
	std	%f4,[%l6]
	fxtod	%f8,%f2
	std	%f6,[%l6+8]

	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]
.L2:	fdtox	%f6,%f6
	std	%f4,[%l4]
	std	%f6,[%l4+8]

	ldx	[%l6],%g2		C p16
	ldx	[%l6+8],%g1		C p0
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	stw	%l0,[%i0-4]

.L1:	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	stw	%l0,[%i0-4]

	ret
	restore	%g0,%g0,%o0		C delay slot: pop the window, o0 = 0

EPILOGUE(mpn_sqr_diagonal)