github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/sparc64/ultrasparc1234/sub_n.asm (about)

     1  dnl  SPARC v9 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
     2  dnl  store difference in a third limb vector.
     3  
     4  dnl  Copyright 2001-2003, 2011 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  C		   cycles/limb
    35  C UltraSPARC 1&2:     4
    36  C UltraSPARC 3:	      4.5
    37  
    38  C Compute carry-out from the most significant bits of u,v, and r, where
    39  C r=u-v-carry_in, using logic operations.
    40  
    41  C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4 insn
    42  C recurrency, and on the UltraSPARC 1 and 2 the IE units are 100% saturated.
    43  C Therefore, it seems futile to try to optimize this any further...
    44  
    45  C INPUT PARAMETERS
    46  define(`rp',`%i0')		dnl  where the difference limbs are stored
    47  define(`up',`%i1')		dnl  source of the u limbs (minuend)
    48  define(`vp',`%i2')		dnl  source of the v limbs (subtrahend)
    49  define(`n',`%i3')		dnl  limb count, decremented as limbs are consumed
    50  
    51  define(`u0',`%l0')		dnl  u0..u3: four consecutive limbs loaded from up
    52  define(`u1',`%l2')
    53  define(`u2',`%l4')
    54  define(`u3',`%l6')
    55  define(`v0',`%l1')		dnl  v0..v3: the matching four limbs loaded from vp
    56  define(`v1',`%l3')
    57  define(`v2',`%l5')
    58  define(`v3',`%l7')
    59  
    60  define(`cy',`%i4')		dnl  borrow between limbs; mpn_sub_nc uses the incoming value, mpn_sub_n clears it
    61  
    62  define(`fanop',`fitod %f0,%f2')		dnl  A quasi nop running in the FA pipe
    63  define(`fmnop',`fmuld %f0,%f0,%f4')	dnl  A quasi nop running in the FM pipe
    64  
    65  ASM_START()
        C %g2 and %g3 serve as temporaries in the borrow computation below.
    66  	REGISTER(%g2,#scratch)
    67  	REGISTER(%g3,#scratch)
    68  PROLOGUE(mpn_sub_nc)
        C mpn_sub_nc: entry point that subtracts with an initial borrow.  Unlike
        C mpn_sub_n it does not clear cy, so the value arriving in %i4 is used as
        C the borrow into the first limb.  All the work is done by code that lives
        C inside mpn_sub_n: .Loop0 when n < 4, L(com) otherwise.
    69  	save	%sp,-160,%sp	C open a new register window and stack frame
    70  
    71  	fitod	%f0,%f0		C make sure f0 contains small, quiet number
    72  	subcc	n,4,%g0		C set flags from n - 4, result discarded
    73  	bl,pn	%xcc,.Loop0	C n < 4: do every limb in the one-limb loop
    74  	nop			C delay slot: nothing to do, cy stays as passed in
    75  	b,a	L(com)		C n >= 4: join the four-limb path of mpn_sub_n
    76  EPILOGUE()
    77  
    78  PROLOGUE(mpn_sub_n)
        C mpn_sub_n: subtract the n limbs at vp from the n limbs at up, store the
        C differences at rp and return the final borrow in %i0 (see .Lret).
        C Requires n > 0, as stated in the file header; .Loop0 always processes
        C at least one limb.
        C
        C Starts with cy = 0.  For n >= 4 the limbs are handled four at a time
        C (L(com), .Loop, .Lend4567); the remaining 0..3 limbs, or all limbs when
        C n < 4, go through the one-limb loop .Loop0.
        C
        C With r = u - v - cy, the borrow out of a limb is the top bit of
        C   (r | ~(u | ~v)) & ~(u & ~v)
        C which is what each orn/orn/andn/andn/srlx sequence computes.
    79  	save	%sp,-160,%sp	C open a new register window and stack frame
    80  
    81  	fitod	%f0,%f0		C make sure f0 contains small, quiet number
    82  	subcc	n,4,%g0		C set flags from n - 4, result discarded
    83  	bl,pn	%xcc,.Loop0	C n < 4: do every limb in the one-limb loop
    84  	mov	0,cy		C delay slot, runs on both paths: no borrow in
    85  L(com):				C mpn_sub_nc joins here with its own cy
    86  	ldx	[up+0],u0	C preload the first four limbs of each operand
    87  	ldx	[vp+0],v0
    88  	add	up,32,up	C up is now 4 limbs ahead: limbs 1..3 sit at -24..-8
    89  	ldx	[up-24],u1
    90  	ldx	[vp+8],v1
    91  	add	vp,32,vp	C vp advanced after v1, so v2/v3 use negative offsets
    92  	ldx	[up-16],u2
    93  	ldx	[vp-16],v2
    94  	ldx	[up-8],u3
    95  	ldx	[vp-8],v3
    96  	subcc	n,8,n		C n = limbs not yet stored - 8; flags used by bl below
    97  	sub	u0,v0,%g1	C main sub
    98  	sub	%g1,cy,%g5	C carry sub
    99  	orn	u0,v0,%g2	C %g2 = u0 | ~v0, first step of the borrow logic
   100  	bl,pn	%xcc,.Lend4567	C only 4..7 limbs in total: skip the main loop
   101  	fanop			C delay slot
   102  	b,a	.Loop
   103  
   104  	.align	16
   105  C START MAIN LOOP
        C On entry to each pass: u0..u3 and v0..v3 hold four loaded limbs, %g5 is
        C the not yet stored difference of limb 0, %g2 = u0 | ~v0, and n >= 0 is
        C the number of unstored limbs minus 8, so four more limbs can be loaded.
        C Each pass stores four differences and loads the next four limbs.
        C The "C --" lines separate groups of four instructions.
   106  .Loop:	orn	%g5,%g2,%g2	C %g2 = r | ~(u0 | ~v0)
   107  	andn	u0,v0,%g3	C %g3 = u0 & ~v0
   108  	ldx	[up+0],u0	C u0 no longer needed: load it for the next pass
   109  	fanop
   110  C --
   111  	andn	%g2,%g3,%g2	C top bit of %g2 = borrow out of limb 0
   112  	ldx	[vp+0],v0
   113  	add	up,32,up
   114  	fanop
   115  C --
   116  	srlx	%g2,63,cy	C cy = that top bit
   117  	sub	u1,v1,%g1	C limb 1: main sub
   118  	stx	%g5,[rp+0]	C store limb 0 difference
   119  	fanop
   120  C --
   121  	sub	%g1,cy,%g5	C limb 1: carry sub
   122  	orn	u1,v1,%g2
   123  	fmnop
   124  	fanop
   125  C --
   126  	orn	%g5,%g2,%g2
   127  	andn	u1,v1,%g3
   128  	ldx	[up-24],u1	C up was already advanced above
   129  	fanop
   130  C --
   131  	andn	%g2,%g3,%g2
   132  	ldx	[vp+8],v1
   133  	add	vp,32,vp
   134  	fanop
   135  C --
   136  	srlx	%g2,63,cy	C borrow out of limb 1
   137  	sub	u2,v2,%g1	C limb 2: main sub
   138  	stx	%g5,[rp+8]	C store limb 1 difference
   139  	fanop
   140  C --
   141  	sub	%g1,cy,%g5	C limb 2: carry sub
   142  	orn	u2,v2,%g2
   143  	fmnop
   144  	fanop
   145  C --
   146  	orn	%g5,%g2,%g2
   147  	andn	u2,v2,%g3
   148  	ldx	[up-16],u2
   149  	fanop
   150  C --
   151  	andn	%g2,%g3,%g2
   152  	ldx	[vp-16],v2
   153  	add	rp,32,rp	C rp advanced mid-pass: last two stores use -16 and -8
   154  	fanop
   155  C --
   156  	srlx	%g2,63,cy	C borrow out of limb 2
   157  	sub	u3,v3,%g1	C limb 3: main sub
   158  	stx	%g5,[rp-16]	C store limb 2 difference
   159  	fanop
   160  C --
   161  	sub	%g1,cy,%g5	C limb 3: carry sub
   162  	orn	u3,v3,%g2
   163  	fmnop
   164  	fanop
   165  C --
   166  	orn	%g5,%g2,%g2
   167  	andn	u3,v3,%g3
   168  	ldx	[up-8],u3
   169  	fanop
   170  C --
   171  	andn	%g2,%g3,%g2
   172  	subcc	n,4,n		C four limbs done; flags are consumed by bge below
   173  	ldx	[vp-8],v3
   174  	fanop
   175  C --
   176  	srlx	%g2,63,cy	C borrow out of limb 3
   177  	sub	u0,v0,%g1	C limb 0 of the next group: main sub
   178  	stx	%g5,[rp-8]	C store limb 3 difference
   179  	fanop
   180  C --
   181  	sub	%g1,cy,%g5	C limb 0 of the next group: carry sub
   182  	orn	u0,v0,%g2
   183  	bge,pt	%xcc,.Loop	C n >= 0: four more limbs can still be loaded
   184  	fanop			C delay slot
   185  C END MAIN LOOP
        C Wind-down: same four-limb computation as one loop pass, on the limbs
        C already in registers, without loading any further limbs.
   186  .Lend4567:
   187  	orn	%g5,%g2,%g2
   188  	andn	u0,v0,%g3
   189  	andn	%g2,%g3,%g2
   190  	srlx	%g2,63,cy	C borrow out of limb 0
   191  	sub	u1,v1,%g1
   192  	stx	%g5,[rp+0]
   193  	sub	%g1,cy,%g5
   194  	orn	u1,v1,%g2
   195  	orn	%g5,%g2,%g2
   196  	andn	u1,v1,%g3
   197  	andn	%g2,%g3,%g2
   198  	srlx	%g2,63,cy	C borrow out of limb 1
   199  	sub	u2,v2,%g1
   200  	stx	%g5,[rp+8]
   201  	sub	%g1,cy,%g5
   202  	orn	u2,v2,%g2
   203  	orn	%g5,%g2,%g2
   204  	andn	u2,v2,%g3
   205  	andn	%g2,%g3,%g2
   206  	add	rp,32,rp	C rp now points past this group of four
   207  	srlx	%g2,63,cy	C borrow out of limb 2
   208  	sub	u3,v3,%g1
   209  	stx	%g5,[rp-16]
   210  	sub	%g1,cy,%g5
   211  	orn	u3,v3,%g2
   212  	orn	%g5,%g2,%g2
   213  	andn	u3,v3,%g3
   214  	andn	%g2,%g3,%g2
   215  	srlx	%g2,63,cy	C borrow out of limb 3
   216  	stx	%g5,[rp-8]
   217  
   218  	addcc	n,4,n		C n was -4..-1 here; n + 4 = limbs still to do (0..3)
   219  	bz,pn	%xcc,.Lret	C nothing left: return cy
   220  	fanop			C delay slot
   221  
        C One limb per iteration; also the whole job when the function is
        C entered with n < 4.  cy carries the borrow from limb to limb.
   222  .Loop0:	ldx	[up],u0
   223  	add	up,8,up
   224  	ldx	[vp],v0
   225  	add	vp,8,vp
   226  	add	rp,8,rp		C rp bumped early, hence the store to rp-8 below
   227  	subcc	n,1,n		C flags are consumed by bnz below
   228  	sub	u0,v0,%g1	C main sub
   229  	orn	u0,v0,%g2
   230  	sub	%g1,cy,%g5	C carry sub
   231  	andn	u0,v0,%g3
   232  	orn	%g5,%g2,%g2
   233  	stx	%g5,[rp-8]
   234  	andn	%g2,%g3,%g2
   235  	bnz,pt	%xcc,.Loop0	C more limbs to do
   236  	srlx	%g2,63,cy	C delay slot, runs every iteration: cy = borrow out
   237  
   238  .Lret:	mov	cy,%i0		C return value: borrow out of the last limb
   239  	ret
   240  	restore			C delay slot of ret: leave the register window
   241  EPILOGUE(mpn_sub_n)