github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/sparc64/ultrasparc1234/add_n.asm (about)

     1  dnl  SPARC v9 mpn_add_n -- Add two limb vectors of the same length > 0 and
     2  dnl  store sum in a third limb vector.
     3  
     4  dnl  Copyright 2001-2003, 2011 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  C		   cycles/limb
    35  C UltraSPARC 1&2:     4
    36  C UltraSPARC 3:	      4.5
    37  
    38  C Compute carry-out from the most significant bits of u,v, and r, where
    39  C r=u+v+carry_in, using logic operations.
    40  
    41  C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4 insn
    42  C recurrency, and the UltraSPARC 1 and 2 the IE units are 100% saturated.
    43  C Therefore, it seems futile to try to optimize this any further...
    44  
    45  C INPUT PARAMETERS
dnl  Input parameter registers (after `save', the incoming %o0-%o4
dnl  arguments appear as %i0-%i4; %i4 is the carry-in for mpn_add_nc).
     46  define(`rp', `%i0')	dnl  destination (sum) pointer
     47  define(`up', `%i1')	dnl  first source pointer
     48  define(`vp', `%i2')	dnl  second source pointer
     49  define(`n',  `%i3')	dnl  limb count
     50  
dnl  Four u/v limb pairs kept live at once for the 4-way unrolled loop.
     51  define(`u0', `%l0')
     52  define(`u1', `%l2')
     53  define(`u2', `%l4')
     54  define(`u3', `%l6')
     55  define(`v0', `%l1')
     56  define(`v1', `%l3')
     57  define(`v2', `%l5')
     58  define(`v3', `%l7')
     59  
     60  define(`cy',`%i4')	dnl  running carry limb (0 or 1)
     61  
     62  define(`fanop',`fitod %f0,%f2')		dnl  A quasi nop running in the FA pipe
     63  define(`fmnop',`fmuld %f0,%f0,%f4')	dnl  A quasi nop running in the FM pipe
    64  
C mpn_add_nc(rp, up, vp, n, cy) -- add with a caller-supplied carry-in.
C The carry-in arrives in %i4, which is exactly the register aliased to
C `cy' above, so the branch delay slot needs no initialization.
     65  ASM_START()
     66  	REGISTER(%g2,#scratch)
     67  	REGISTER(%g3,#scratch)
     68  PROLOGUE(mpn_add_nc)
     69  	save	%sp,-160,%sp
     70  
     71  	fitod	%f0,%f0		C make sure f0 contains small, quiet number
     72  	subcc	n,4,%g0		C set condition codes for n - 4 (result discarded)
     73  	bl,pn	%xcc,.Loop0	C n < 4: take the one-limb-per-pass loop
     74  	nop			C delay slot: cy already holds the carry-in
     75  	b,a	L(com)		C n >= 4: join the unrolled code in mpn_add_n
     76  EPILOGUE()
    77  
C mpn_add_n(rp, up, vp, n) -- add {up,n} and {vp,n}, store the sum at
C {rp,n}, and return the carry-out in %i0.  No carry-in: the branch
C delay slot zeroes cy before falling into code shared with mpn_add_nc.
     78  PROLOGUE(mpn_add_n)
     79  	save	%sp,-160,%sp
     80  
     81  	fitod	%f0,%f0		C make sure f0 contains small, quiet number
     82  	subcc	n,4,%g0		C set condition codes for n - 4 (result discarded)
     83  	bl,pn	%xcc,.Loop0	C n < 4: take the one-limb-per-pass loop
     84  	mov	0,cy		C delay slot: no carry-in for mpn_add_n
C Code shared with mpn_add_nc.  Preload four limbs from each operand and
C start the first limb's add; up/vp are advanced a full 32 bytes (four
C limbs) ahead of their use (software pipelining).
     85  L(com):
     86  	ldx	[up+0],u0
     87  	ldx	[vp+0],v0
     88  	add	up,32,up
     89  	ldx	[up-24],u1
     90  	ldx	[vp+8],v1
     91  	add	vp,32,vp
     92  	ldx	[up-16],u2
     93  	ldx	[vp-16],v2
     94  	ldx	[up-8],u3
     95  	ldx	[vp-8],v3
     96  	subcc	n,8,n		C 4 limbs preloaded, plus a -4 bias for the loop test
     97  	add	u0,v0,%g1	C main add
     98  	add	%g1,cy,%g5	C carry add
     99  	or	u0,v0,%g2
    100  	bl,pn	%xcc,.Lend4567	C only 4..7 limbs in total: skip the main loop
    101  	fanop
    102  	b,a	.Loop
    103  
    104  	.align	16
    105  C START MAIN LOOP
C Four limbs per iteration, software pipelined.  For each limb:
C %g5 = u + v + cy, and (per the note at the top of the file) the next
C carry is bit 63 of (u AND v) OR ((u OR v) AND NOT %g5) -- logic ops
C only, forming the 4-insn recurrence.  The fanop/fmnop fillers are FP
C quasi-nops (see the defines) used to shape the issue groups; the
C `C --' lines mark the intended group boundaries.
    106  .Loop:	andn	%g2,%g5,%g2
    107  	and	u0,v0,%g3
    108  	ldx	[up+0],u0	C load u0 for the NEXT iteration
    109  	fanop
    110  C --
    111  	or	%g3,%g2,%g2
    112  	ldx	[vp+0],v0
    113  	add	up,32,up
    114  	fanop
    115  C --
    116  	srlx	%g2,63,cy	C carry out of the first limb
    117  	add	u1,v1,%g1
    118  	stx	%g5,[rp+0]	C store first limb's sum
    119  	fanop
    120  C --
    121  	add	%g1,cy,%g5
    122  	or	u1,v1,%g2
    123  	fmnop
    124  	fanop
    125  C --
    126  	andn	%g2,%g5,%g2
    127  	and	u1,v1,%g3
    128  	ldx	[up-24],u1
    129  	fanop
    130  C --
    131  	or	%g3,%g2,%g2
    132  	ldx	[vp+8],v1
    133  	add	vp,32,vp
    134  	fanop
    135  C --
    136  	srlx	%g2,63,cy
    137  	add	u2,v2,%g1
    138  	stx	%g5,[rp+8]
    139  	fanop
    140  C --
    141  	add	%g1,cy,%g5
    142  	or	u2,v2,%g2
    143  	fmnop
    144  	fanop
    145  C --
    146  	andn	%g2,%g5,%g2
    147  	and	u2,v2,%g3
    148  	ldx	[up-16],u2
    149  	fanop
    150  C --
    151  	or	%g3,%g2,%g2
    152  	ldx	[vp-16],v2
    153  	add	rp,32,rp	C advance rp here; remaining stores use negative offsets
    154  	fanop
    155  C --
    156  	srlx	%g2,63,cy
    157  	add	u3,v3,%g1
    158  	stx	%g5,[rp-16]
    159  	fanop
    160  C --
    161  	add	%g1,cy,%g5
    162  	or	u3,v3,%g2
    163  	fmnop
    164  	fanop
    165  C --
    166  	andn	%g2,%g5,%g2
    167  	and	u3,v3,%g3
    168  	ldx	[up-8],u3
    169  	fanop
    170  C --
    171  	or	%g3,%g2,%g2
    172  	subcc	n,4,n		C four more limbs consumed
    173  	ldx	[vp-8],v3
    174  	fanop
    175  C --
    176  	srlx	%g2,63,cy
    177  	add	u0,v0,%g1	C start next iteration's first add
    178  	stx	%g5,[rp-8]
    179  	fanop
    180  C --
    181  	add	%g1,cy,%g5
    182  	or	u0,v0,%g2
    183  	bge,pt	%xcc,.Loop
    184  	fanop
    185  C END MAIN LOOP
C Wind-down: finish the four limbs already loaded (no further loads).
    186  .Lend4567:
    187  	andn	%g2,%g5,%g2
    188  	and	u0,v0,%g3
    189  	or	%g3,%g2,%g2
    190  	srlx	%g2,63,cy
    191  	add	u1,v1,%g1
    192  	stx	%g5,[rp+0]
    193  	add	%g1,cy,%g5
    194  	or	u1,v1,%g2
    195  	andn	%g2,%g5,%g2
    196  	and	u1,v1,%g3
    197  	or	%g3,%g2,%g2
    198  	srlx	%g2,63,cy
    199  	add	u2,v2,%g1
    200  	stx	%g5,[rp+8]
    201  	add	%g1,cy,%g5
    202  	or	u2,v2,%g2
    203  	andn	%g2,%g5,%g2
    204  	and	u2,v2,%g3
    205  	or	%g3,%g2,%g2
    206  	add	rp,32,rp
    207  	srlx	%g2,63,cy
    208  	add	u3,v3,%g1
    209  	stx	%g5,[rp-16]
    210  	add	%g1,cy,%g5
    211  	or	u3,v3,%g2
    212  	andn	%g2,%g5,%g2
    213  	and	u3,v3,%g3
    214  	or	%g3,%g2,%g2
    215  	srlx	%g2,63,cy
    216  	stx	%g5,[rp-8]
    217  
    218  	addcc	n,4,n		C undo the -4 bias; n = leftover limbs (0..3)
    219  	bz,pn	%xcc,.Lret	C nothing left: return
    220  	fanop
    221  
C One limb per pass: handles n < 4, and the 1..3 leftover limbs after
C the unrolled path.  Same logic-op carry recurrence as the main loop.
    222  .Loop0:	ldx	[up],u0
    223  	add	up,8,up
    224  	ldx	[vp],v0
    225  	add	vp,8,vp
    226  	add	rp,8,rp
    227  	subcc	n,1,n
    228  	add	u0,v0,%g1	C main add
    229  	or	u0,v0,%g2
    230  	add	%g1,cy,%g5	C carry add
    231  	and	u0,v0,%g3
    232  	andn	%g2,%g5,%g2
    233  	stx	%g5,[rp-8]
    234  	or	%g3,%g2,%g2
    235  	bnz,pt	%xcc,.Loop0
    236  	srlx	%g2,63,cy	C delay slot: extract the carry bit
    237  
    238  .Lret:	mov	cy,%i0		C return value: final carry-out
    239  	ret
    240  	restore
    241  EPILOGUE()