github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/sparc32/v9/sqr_diagonal.asm (about)

     1  dnl  SPARC v9 32-bit mpn_sqr_diagonal.
     2  
     3  dnl  Copyright 2001, 2003 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  
    32  include(`../config.m4')
    33  
    34  C INPUT PARAMETERS
    35  C rp	i0
    36  C up	i1
    37  C n	i2
    38  
     39  C This code uses a very deep software pipeline, due to the need for moving data
     40  C back and forth between the integer registers and floating-point registers.
    41  C
    42  C A VIS variant of this code would make the pipeline less deep, since the
    43  C masking now done in the integer unit could take place in the floating-point
    44  C unit using the FAND instruction.  It would be possible to save several cycles
    45  C too.
    46  C
    47  C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
    48  C not much slower from the Ecache.  It would perhaps be possible to shave off
     49  C one cycle, but not easily.  We cannot do better than 10 cycles/limb with the
     50  C instructions used, since we have 10 memory operations per limb.  But a VIS
    51  C variant could run three cycles faster than the corresponding non-VIS code.
    52  
    53  C This is non-pipelined code showing the algorithm:
    54  C
    55  C .Loop:
    56  C	lduw	[up+0],%g4		C 00000000hhhhllll
    57  C	sllx	%g4,16,%g3		C 0000hhhhllll0000
    58  C	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
    59  C	andn	%g2,%g5,%g2		C 0000hhhh0000llll
    60  C	stx	%g2,[%fp+80]
    61  C	ldd	[%fp+80],%f0
    62  C	fitod	%f0,%f4			C hi16
    63  C	fitod	%f1,%f6			C lo16
    64  C	ld	[up+0],%f9
    65  C	fxtod	%f8,%f2
    66  C	fmuld	%f2,%f4,%f4
    67  C	fmuld	%f2,%f6,%f6
    68  C	fdtox	%f4,%f4
    69  C	fdtox	%f6,%f6
    70  C	std	%f4,[%fp-24]
    71  C	std	%f6,[%fp-16]
    72  C	ldx	[%fp-24],%g2
    73  C	ldx	[%fp-16],%g1
    74  C	sllx	%g2,16,%g2
    75  C	add	%g2,%g1,%g1
    76  C	stw	%g1,[rp+0]
    77  C	srlx	%g1,32,%l0
    78  C	stw	%l0,[rp+4]
    79  C	add	up,4,up
    80  C	subcc	n,1,n
    81  C	bne,pt	%icc,.Loop
    82  C	add	rp,8,rp
    83  
     84  define(`fanop',`fitod %f12,%f10')	dnl  A quasi nop running in the FA pipe
         C fanop expands to a fitod from %f12 to %f10.  Neither register is referenced
         C by any other instruction of mpn_sqr_diagonal so the result is never read.
     85  
     86  ASM_START()
     87  
     88  	TEXT
     89  	ALIGN(4)
         C A zero word placed in the text section.  mpn_sqr_diagonal loads it into %f8
         C so that the register pair %f8 and %f9 reads as a zero-extended 32-bit limb.
     90  .Lnoll:
     91  	.word	0
    92  
     93  PROLOGUE(mpn_sqr_diagonal)
         C Squares each of the n 32-bit limbs at up and stores every 64-bit square at
         C rp as two 32-bit words with the low word first.
         C
         C Each square is formed in the FPU.  The limb is spread into a 64-bit word
         C holding its high and low 16-bit halves in separate 32-bit words, the word
         C is bounced through the stack into %f0 and %f1, both halves are converted
         C with fitod and multiplied by the whole limb converted with fxtod.  The two
         C products are converted back with fdtox, bounced through the stack into the
         C integer unit and combined as p16 * 2^16 + p0.
         C
         C Register and stack usage:
         C   %i0                  rp, advanced by 8 for every square stored
         C   %i1                  up, biased by -8 on entry and advanced by 4 per limb
         C   %i2                  n, counted down once per limb fetched
         C   %g5                  mask 0xffff0000 that clears bits 16-31
         C   %f8                  zero word, upper half of the %f8 and %f9 pair
         C   [%fp+80], [%fp+72]   two alternating slots for the spread limb
         C   [%fp-24], [%fp-16]   p16 and p0 of one limb
         C   [%fp-40], [%fp-32]   p16 and p0 of the alternate limb
         C   %l3 to %l6           slot addresses used by the wind-down code
         C
         C NOTE(review): the code appears to assume n >= 1.  There is no test for
         C n = 0, confirm against callers.  The final restore writes zero to the
         C caller %o0.
     94  	save	%sp,-256,%sp
     95  
         C Load the zero word at .Lnoll into %f8, relative to the program counter for
         C position-independent builds and through sethi and %lo otherwise.
     96  ifdef(`PIC',
     97  `.Lpc:	rd	%pc,%o7
     98  	ld	[%o7+.Lnoll-.Lpc],%f8',
     99  `	sethi	%hi(.Lnoll),%g1
    100  	ld	[%g1+%lo(.Lnoll)],%f8')
    101  
    102  	sethi	%hi(0xffff0000),%g5		C mask for bits 16-31
    103  	add	%i1,-8,%i1		C bias up so that limb fetches use offset 8
    104  
    105  	lduw	[%i1+8],%g4		C fetch limb 0
    106  	add	%i1,4,%i1		C s1_ptr++
    107  	sllx	%g4,16,%g3		C 0000hhhhllll0000
    108  	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
    109  	subcc	%i2,1,%i2		C count the limb just fetched
    110  	bne,pt	%icc,.L_grt_1		C taken unless that was the last limb
    111  	andn	%g2,%g5,%g2		C 0000hhhh0000llll
    112  
         C Exactly one limb: form its two products without any overlap and let .L1
         C combine and store them.
    113  	add	%i1,4,%i1		C s1_ptr++
    114  	stx	%g2,[%fp+80]
    115  	ld	[%i1],%f9		C %f8 and %f9 now hold the limb as a 64-bit integer
    116  	ldd	[%fp+80],%f0		C %f0 = hi16, %f1 = lo16
    117  	fxtod	%f8,%f2		C whole limb as a double
    118  	fitod	%f0,%f4		C hi16
    119  	fitod	%f1,%f6		C lo16
    120  	fmuld	%f2,%f4,%f4		C limb * hi16
    121  	fmuld	%f2,%f6,%f6		C limb * lo16
    122  	fdtox	%f4,%f4
    123  	fdtox	%f6,%f6
    124  	std	%f4,[%fp-24]		C p16
    125  	std	%f6,[%fp-16]		C p0
    126  
    127  	add	%fp, 80, %l3
    128  	add	%fp, -24, %l4		C .L1 reads p16 and p0 through %l4
    129  	add	%fp, 72, %l5
    130  	b	.L1
    131  	add	%fp, -40, %l6		C executed in the branch delay slot
    132  
         C Two or more limbs: spill the spread limb 0 and fetch limb 1.
    133  .L_grt_1:
    134  	stx	%g2,[%fp+80]
    135  	lduw	[%i1+8],%g4
    136  	add	%i1,4,%i1		C s1_ptr++
    137  	sllx	%g4,16,%g3		C 0000hhhhllll0000
    138  	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
    139  	subcc	%i2,1,%i2
    140  	bne,pt	%icc,.L_grt_2		C taken unless limb 1 was the last limb
    141  	andn	%g2,%g5,%g2		C 0000hhhh0000llll
    142  
         C Exactly two limbs: take limb 0 up to the spill of its products and limb 1
         C up to its first fdtox, then join the wind-down at .L2.
    143  	stx	%g2,[%fp+72]
    144  	ld	[%i1],%f9
    145  	add	%i1,4,%i1		C s1_ptr++
    146  	ldd	[%fp+80],%f0
    147  	fxtod	%f8,%f2
    148  	fitod	%f0,%f4
    149  	fitod	%f1,%f6
    150  	fmuld	%f2,%f4,%f4
    151  	ld	[%i1],%f9
    152  	fmuld	%f2,%f6,%f6
    153  	ldd	[%fp+72],%f0
    154  	fdtox	%f4,%f4
    155  	fdtox	%f6,%f6
    156  	std	%f4,[%fp-24]
    157  	fxtod	%f8,%f2
    158  	std	%f6,[%fp-16]
    159  	fitod	%f0,%f4
    160  	fitod	%f1,%f6
    161  	fmuld	%f2,%f4,%f4
    162  	fmuld	%f2,%f6,%f6
    163  	fdtox	%f4,%f4
    164  
    165  	add	%fp, 72, %l3
    166  	add	%fp, -40, %l4		C .L2 spills the products of limb 1 here
    167  	add	%fp, 80, %l5
    168  	b	.L2
    169  	add	%fp, -24, %l6		C delay slot, products of limb 0 live here
    170  
         C Three or more limbs: spill the spread limb 1, fetch limb 2 and start
         C converting limb 0.
    171  .L_grt_2:
    172  	stx	%g2,[%fp+72]
    173  	lduw	[%i1+8],%g4
    174  	ld	[%i1],%f9
    175  	add	%i1,4,%i1		C s1_ptr++
    176  	ldd	[%fp+80],%f0
    177  	sllx	%g4,16,%g3		C 0000hhhhllll0000
    178  	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
    179  	subcc	%i2,1,%i2
    180  	fxtod	%f8,%f2
    181  	bne,pt	%icc,.L_grt_3		C taken unless limb 2 was the last limb
    182  	andn	%g2,%g5,%g2		C 0000hhhh0000llll
    183  
         C Exactly three limbs: spill the products of limb 0, take limb 1 up to its
         C first fdtox, load the halves of limb 2, then join the wind-down at .L3.
    184  	stx	%g2,[%fp+80]
    185  	fitod	%f0,%f4
    186  	fitod	%f1,%f6
    187  	fmuld	%f2,%f4,%f4
    188  	ld	[%i1],%f9
    189  	fmuld	%f2,%f6,%f6
    190  	add	%i1,4,%i1		C s1_ptr++
    191  	ldd	[%fp+72],%f0
    192  	fdtox	%f4,%f4
    193  	fdtox	%f6,%f6
    194  	std	%f4,[%fp-24]
    195  	fxtod	%f8,%f2
    196  	std	%f6,[%fp-16]
    197  	fitod	%f0,%f4
    198  	fitod	%f1,%f6
    199  	fmuld	%f2,%f4,%f4
    200  	ld	[%i1],%f9
    201  	add	%fp, 80, %l3
    202  	fmuld	%f2,%f6,%f6
    203  	add	%fp, -24, %l4
    204  	ldd	[%fp+80],%f0
    205  	add	%fp, 72, %l5
    206  	fdtox	%f4,%f4
    207  	b	.L3
    208  	add	%fp, -40, %l6		C delay slot
    209  
         C Four or more limbs: spill the spread limb 2, fetch limb 3, multiply limb 0
         C and spill its products to the slot pair at %fp-24.
    210  .L_grt_3:
    211  	stx	%g2,[%fp+80]
    212  	fitod	%f0,%f4
    213  	lduw	[%i1+8],%g4
    214  	fitod	%f1,%f6
    215  	fmuld	%f2,%f4,%f4
    216  	ld	[%i1],%f9
    217  	fmuld	%f2,%f6,%f6
    218  	add	%i1,4,%i1		C s1_ptr++
    219  	ldd	[%fp+72],%f0
    220  	fdtox	%f4,%f4
    221  	sllx	%g4,16,%g3		C 0000hhhhllll0000
    222  	fdtox	%f6,%f6
    223  	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
    224  	subcc	%i2,1,%i2
    225  	std	%f4,[%fp-24]
    226  	fxtod	%f8,%f2
    227  	std	%f6,[%fp-16]
    228  	bne,pt	%icc,.L_grt_4		C taken unless limb 3 was the last limb
    229  	andn	%g2,%g5,%g2		C 0000hhhh0000llll
    230  
         C Exactly four limbs: spill the spread limb 3, take limb 1 up to its first
         C fdtox, load the halves of limb 2, then join the wind-down at .L4.
    231  	stx	%g2,[%fp+72]
    232  	fitod	%f0,%f4
    233  	fitod	%f1,%f6
    234  	add	%fp, 72, %l3
    235  	fmuld	%f2,%f4,%f4
    236  	add	%fp, -40, %l4
    237  	ld	[%i1],%f9
    238  	fmuld	%f2,%f6,%f6
    239  	add	%i1,4,%i1		C s1_ptr++
    240  	ldd	[%fp+80],%f0
    241  	add	%fp, 80, %l5
    242  	fdtox	%f4,%f4
    243  	b	.L4
    244  	add	%fp, -24, %l6		C delay slot
    245  
         C Five or more limbs: the same step again on the alternate slots.  Limb 4 is
         C fetched and the products of limb 1 are spilled to the slot pair at %fp-40.
    246  .L_grt_4:
    247  	stx	%g2,[%fp+72]
    248  	fitod	%f0,%f4
    249  	lduw	[%i1+8],%g4
    250  	fitod	%f1,%f6
    251  	fmuld	%f2,%f4,%f4
    252  	ld	[%i1],%f9
    253  	fmuld	%f2,%f6,%f6
    254  	add	%i1,4,%i1		C s1_ptr++
    255  	ldd	[%fp+80],%f0
    256  	fdtox	%f4,%f4
    257  	sllx	%g4,16,%g3		C 0000hhhhllll0000
    258  	fdtox	%f6,%f6
    259  	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
    260  	subcc	%i2,1,%i2
    261  	std	%f4,[%fp-40]
    262  	fxtod	%f8,%f2
    263  	std	%f6,[%fp-32]
    264  	be,pn	%icc,.L5		C taken when limb 4 was the last limb
    265  	andn	%g2,%g5,%g2		C 0000hhhh0000llll
    266  
    267  	b,a	.Loop		C annulled branch over any alignment padding
    268  
         C Main loop, unrolled twice.  Each half spills the spread form of the limb
         C fetched in the previous half, fetches one new limb, advances the FPU work
         C of the limbs already in flight and stores one finished square at rp.  The
         C two halves differ only in the stack slots they use.  Every group between
         C the dashed lines holds four instructions, presumably one issue group.
    269  	.align	16
    270  C --- LOOP BEGIN
    271  .Loop:	nop
    272  	nop
    273  	stx	%g2,[%fp+80]
    274  	fitod	%f0,%f4
    275  C ---
    276  	nop
    277  	nop
    278  	lduw	[%i1+8],%g4
    279  	fitod	%f1,%f6
    280  C ---
    281  	nop
    282  	nop
    283  	ldx	[%fp-24],%g2		C p16
    284  	fanop
    285  C ---
    286  	nop
    287  	nop
    288  	ldx	[%fp-16],%g1		C p0
    289  	fmuld	%f2,%f4,%f4
    290  C ---
    291  	sllx	%g2,16,%g2		C align p16
    292  	add	%i0,8,%i0		C res_ptr++
    293  	ld	[%i1],%f9
    294  	fmuld	%f2,%f6,%f6
    295  C ---
    296  	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
    297  	add	%i1,4,%i1		C s1_ptr++
    298  	ldd	[%fp+72],%f0
    299  	fanop
    300  C ---
    301  	srlx	%g1,32,%l0
    302  	nop
    303  	stw	%g1,[%i0-8]
    304  	fdtox	%f4,%f4
    305  C ---
    306  	sllx	%g4,16,%g3		C 0000hhhhllll0000
    307  	nop
    308  	stw	%l0,[%i0-4]
    309  	fdtox	%f6,%f6
    310  C ---
    311  	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
    312  	subcc	%i2,1,%i2
    313  	std	%f4,[%fp-24]
    314  	fxtod	%f8,%f2
    315  C ---
    316  	std	%f6,[%fp-16]
    317  	andn	%g2,%g5,%g2		C 0000hhhh0000llll
    318  	be,pn	%icc,.Lend		C leave when the limb just fetched was the last
    319  	fanop
    320  C ---  LOOP MIDDLE
    321  	nop
    322  	nop
    323  	stx	%g2,[%fp+72]
    324  	fitod	%f0,%f4
    325  C ---
    326  	nop
    327  	nop
    328  	lduw	[%i1+8],%g4
    329  	fitod	%f1,%f6
    330  C ---
    331  	nop
    332  	nop
    333  	ldx	[%fp-40],%g2		C p16
    334  	fanop
    335  C ---
    336  	nop
    337  	nop
    338  	ldx	[%fp-32],%g1		C p0
    339  	fmuld	%f2,%f4,%f4
    340  C ---
    341  	sllx	%g2,16,%g2		C align p16
    342  	add	%i0,8,%i0		C res_ptr++
    343  	ld	[%i1],%f9
    344  	fmuld	%f2,%f6,%f6
    345  C ---
    346  	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
    347  	add	%i1,4,%i1		C s1_ptr++
    348  	ldd	[%fp+80],%f0
    349  	fanop
    350  C ---
    351  	srlx	%g1,32,%l0
    352  	nop
    353  	stw	%g1,[%i0-8]
    354  	fdtox	%f4,%f4
    355  C ---
    356  	sllx	%g4,16,%g3		C 0000hhhhllll0000
    357  	nop
    358  	stw	%l0,[%i0-4]
    359  	fdtox	%f6,%f6
    360  C ---
    361  	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
    362  	subcc	%i2,1,%i2
    363  	std	%f4,[%fp-40]
    364  	fxtod	%f8,%f2
    365  C ---
    366  	std	%f6,[%fp-32]
    367  	andn	%g2,%g5,%g2		C 0000hhhh0000llll
    368  	bne,pt	%icc,.Loop		C continue while limbs remain to be fetched
    369  	fanop
    370  C --- LOOP END
    371  
         C Reached by falling out of the loop bottom or directly from .L_grt_4.  The
         C slot addresses chosen here match the first loop half.
    372  .L5:	add	%fp, 80, %l3
    373  	add	%fp, -24, %l4
    374  	add	%fp, 72, %l5
    375  	b	.Ltail
    376  	add	%fp, -40, %l6		C delay slot
    377  
         C Reached from the loop middle.  The slot addresses chosen here match the
         C second loop half.
    378  .Lend:	add	%fp, 72, %l3
    379  	add	%fp, -40, %l4
    380  	add	%fp, 80, %l5
    381  	add	%fp, -24, %l6
         C Wind-down.  The code from here on repeats the loop step without fetching
         C new limbs and addresses the slots through %l3 to %l6.  One more pipeline
         C stage drops out at each of .L4, .L3, .L2 and .L1, which are also the
         C direct entry points for inputs of 4, 3, 2 and 1 limbs.
    382  .Ltail:	stx	%g2,[%l3]
    383  	fitod	%f0,%f4
    384  	fitod	%f1,%f6
    385  	ldx	[%l4],%g2		C p16
    386  	ldx	[%l4+8],%g1		C p0
    387  	fmuld	%f2,%f4,%f4
    388  	sllx	%g2,16,%g2		C align p16
    389  	add	%i0,8,%i0		C res_ptr++
    390  	ld	[%i1],%f9
    391  	fmuld	%f2,%f6,%f6
    392  	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
    393  	add	%i1,4,%i1		C s1_ptr++
    394  	ldd	[%l5],%f0
    395  	srlx	%g1,32,%l0
    396  	stw	%g1,[%i0-8]
    397  	fdtox	%f4,%f4
    398  	stw	%l0,[%i0-4]
    399  .L4:	fdtox	%f6,%f6
    400  	std	%f4,[%l4]
    401  	fxtod	%f8,%f2
    402  	std	%f6,[%l4+8]
    403  
    404  	fitod	%f0,%f4
    405  	fitod	%f1,%f6
    406  	ldx	[%l6],%g2		C p16
    407  	ldx	[%l6+8],%g1		C p0
    408  	fmuld	%f2,%f4,%f4
    409  	sllx	%g2,16,%g2		C align p16
    410  	add	%i0,8,%i0		C res_ptr++
    411  	ld	[%i1],%f9		C last limb as the low half of the %f8 and %f9 pair
    412  	fmuld	%f2,%f6,%f6
    413  	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
    414  	ldd	[%l3],%f0		C halves of the last limb
    415  	srlx	%g1,32,%l0
    416  	stw	%g1,[%i0-8]
    417  	fdtox	%f4,%f4
    418  	stw	%l0,[%i0-4]
    419  .L3:	fdtox	%f6,%f6
    420  	std	%f4,[%l6]
    421  	fxtod	%f8,%f2
    422  	std	%f6,[%l6+8]
    423  
    424  	fitod	%f0,%f4
    425  	fitod	%f1,%f6
    426  	ldx	[%l4],%g2		C p16
    427  	ldx	[%l4+8],%g1		C p0
    428  	fmuld	%f2,%f4,%f4
    429  	sllx	%g2,16,%g2		C align p16
    430  	add	%i0,8,%i0		C res_ptr++
    431  	fmuld	%f2,%f6,%f6
    432  	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
    433  	srlx	%g1,32,%l0
    434  	stw	%g1,[%i0-8]
    435  	fdtox	%f4,%f4
    436  	stw	%l0,[%i0-4]
    437  .L2:	fdtox	%f6,%f6
    438  	std	%f4,[%l4]		C spill the products of the last limb
    439  	std	%f6,[%l4+8]
    440  
    441  	ldx	[%l6],%g2		C p16
    442  	ldx	[%l6+8],%g1		C p0
    443  	sllx	%g2,16,%g2		C align p16
    444  	add	%i0,8,%i0		C res_ptr++
    445  	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
    446  	srlx	%g1,32,%l0
    447  	stw	%g1,[%i0-8]
    448  	stw	%l0,[%i0-4]
    449  
    450  .L1:	ldx	[%l4],%g2		C p16
    451  	ldx	[%l4+8],%g1		C p0
    452  	sllx	%g2,16,%g2		C align p16
    453  	add	%i0,8,%i0		C res_ptr++
    454  	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
    455  	srlx	%g1,32,%l0		C high word of the last square
    456  	stw	%g1,[%i0-8]		C low word of the last square
    457  	stw	%l0,[%i0-4]
    458  
    459  	ret
    460  	restore	%g0,%g0,%o0		C delay slot, caller %o0 becomes zero
    461  
    462  EPILOGUE(mpn_sqr_diagonal)