github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/sparc32/v9/submul_1.asm (about)

     1  dnl  SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
     2  dnl  subtract the result from a second limb vector.
     3  
     4  dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  C Algorithm: We use two floating-point multiplies per limb product, with the
    35  C invariant v operand split into two 16-bit pieces, and the u operand split
    36  C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
    37  C the integer unit.
    38  
    39  C		   cycles/limb
    40  C UltraSPARC 1&2:     6.5
    41  C UltraSPARC 3:	      ?
    42  
    43  C Possible optimizations:
    44  C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
    45  C      memory bandwidth limited, this could save 1.5 cycles/limb.
    46  C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
    47  C      it is very straightforward to unroll, using an exit branch midway.
    48  C      Unrolling would allow deeper scheduling which could improve speed for L2
    49  C      cache case.
    50  C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
    51  C      aren't sufficiently apart-scheduled with just two temp areas.
    52  C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
    53  C      could save many operations.
    54  
    55  C INPUT PARAMETERS (leaf routine without save, so arguments stay in %o regs)
    56  C rp	o0
    57  C up	o1
    58  C n	o2
    59  C v	o3
    60  
    61  define(`FSIZE',224)		C bytes of stack scratch reserved by the prologue
    62  
    63  ASM_START()
    64  PROLOGUE(mpn_submul_1)		C rp[] -= up[] * v over n limbs; borrow returned in %o0
    65  	add	%sp, -FSIZE, %sp		C leaf routine: grow the stack by FSIZE, no save
    66  	sethi	%hi(0xffff), %g1
    67  	srl	%o3, 16, %g2		C g2 = v >> 16 (upper 16 bits of v)
    68  	or	%g1, %lo(0xffff), %g1		C g1 = 0xffff
    69  	and	%o3, %g1, %g1		C g1 = v & 0xffff (lower 16 bits of v)
    70  	stx	%g1, [%sp+104]		C spill both halves of v as 64-bit integers ...
    71  	stx	%g2, [%sp+112]
    72  	ldd	[%sp+104], %f6		C ... and reload them into FP registers
    73  	ldd	[%sp+112], %f8
    74  	fxtod	%f6, %f6		C f6 = (double) (v & 0xffff)
    75  	fxtod	%f8, %f8		C f8 = (double) (v >> 16)
    76  	ld	[%sp+104], %f10		C zero f10 (first word of the stx'ed v & 0xffff), so f10:f11 = up[i]
    77  
    78  	mov	0, %g3			C cy = 0
    79  
    80  define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe
    81  
    82  	add	%sp, 160, %o5		C point in scratch area
    83  	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
    84  
    85  	subcc	%o2, 1, %o2		C n--; icc is tested by the bne below
    86  	ld	[%o1], %f11		C read up[i]
    87  	add	%o1, 4, %o1		C up++
    88  	bne,pt	%icc, .L_two_or_more		C taken unless n was 1
    89  	fxtod	%f10, %f2		C (delay slot) f2 = (double) up[i]
    90  C n == 1: form the single product and finish in .L1
    91  	fmuld	%f2, %f8, %f16		C f16 = up[i] * (v >> 16)
    92  	fmuld	%f2, %f6, %f4		C f4 = up[i] * (v & 0xffff)
    93  	fdtox	%f16, %f14		C convert both products back to 64-bit integers
    94  	fdtox	%f4, %f12
    95  	std	%f14, [%o5+16]		C hand the products to the integer unit via scratch memory
    96  	std	%f12, [%o5+24]
    97  	ldx	[%o5+16], %g2		C p16
    98  	ldx	[%o5+24], %g1		C p0
    99  	lduw	[%o0], %g5		C read rp[i]
   100  	b	.L1
   101  	add	%o0, -16, %o0		C (delay slot) bias rp so the store at +16 in .L1 hits rp[0]
   102  
   103  	.align	16
   104  .L_two_or_more:
   105  	subcc	%o2, 1, %o2
   106  	ld	[%o1], %f11		C read up[i]
   107  	fmuld	%f2, %f8, %f16
   108  	fmuld	%f2, %f6, %f4
   109  	add	%o1, 4, %o1		C up++
   110  	bne,pt	%icc, .L_three_or_more
   111  	fxtod	%f10, %f2
   112  C n == 2: finish both products, then drain through .L2 and .L1
   113  	fdtox	%f16, %f14
   114  	fdtox	%f4, %f12
   115  	std	%f14, [%o5+16]
   116  	fmuld	%f2, %f8, %f16
   117  	std	%f12, [%o5+24]
   118  	fmuld	%f2, %f6, %f4
   119  	fdtox	%f16, %f14
   120  	fdtox	%f4, %f12
   121  	std	%f14, [%o5+0]
   122  	std	%f12, [%o5+8]
   123  	lduw	[%o0], %g5		C read rp[i]
   124  	ldx	[%o5+16], %g2		C p16
   125  	ldx	[%o5+24], %g1		C p0
   126  	b	.L2
   127  	add	%o0, -12, %o0		C (delay slot) bias rp so the store at +12 in .L2 hits rp[0]
   128  
   129  	.align	16
   130  .L_three_or_more:
   131  	subcc	%o2, 1, %o2
   132  	ld	[%o1], %f11		C read up[i]
   133  	fdtox	%f16, %f14
   134  	fdtox	%f4, %f12
   135  	std	%f14, [%o5+16]
   136  	fmuld	%f2, %f8, %f16
   137  	std	%f12, [%o5+24]
   138  	fmuld	%f2, %f6, %f4
   139  	add	%o1, 4, %o1		C up++
   140  	bne,pt	%icc, .L_four_or_more
   141  	fxtod	%f10, %f2
   142  C n == 3: finish the pending products, then drain through .L3
   143  	fdtox	%f16, %f14
   144  	fdtox	%f4, %f12
   145  	std	%f14, [%o5+0]
   146  	fmuld	%f2, %f8, %f16
   147  	std	%f12, [%o5+8]
   148  	fmuld	%f2, %f6, %f4
   149  	fdtox	%f16, %f14
   150  	ldx	[%o5+16], %g2		C p16
   151  	fdtox	%f4, %f12
   152  	ldx	[%o5+24], %g1		C p0
   153  	std	%f14, [%o5+16]
   154  	std	%f12, [%o5+24]
   155  	lduw	[%o0], %g5		C read rp[i]
   156  	b	.L3
   157  	add	%o0, -8, %o0		C (delay slot) bias rp so the store at +8 in .L3 hits rp[0]
   158  
   159  	.align	16
   160  .L_four_or_more:
   161  	subcc	%o2, 1, %o2
   162  	ld	[%o1], %f11		C read up[i]
   163  	fdtox	%f16, %f14
   164  	fdtox	%f4, %f12
   165  	std	%f14, [%o5+0]
   166  	fmuld	%f2, %f8, %f16
   167  	std	%f12, [%o5+8]
   168  	fmuld	%f2, %f6, %f4
   169  	add	%o1, 4, %o1		C up++
   170  	bne,pt	%icc, .L_five_or_more
   171  	fxtod	%f10, %f2
   172  C n == 4: finish the pending products, then drain through .L4
   173  	fdtox	%f16, %f14
   174  	ldx	[%o5+16], %g2		C p16
   175  	fdtox	%f4, %f12
   176  	ldx	[%o5+24], %g1		C p0
   177  	std	%f14, [%o5+16]
   178  	fmuld	%f2, %f8, %f16
   179  	std	%f12, [%o5+24]
   180  	fmuld	%f2, %f6, %f4
   181  	add	%o1, 4, %o1		C up++ (up is not read again on this path)
   182  	lduw	[%o0], %g5		C read rp[i]
   183  	b	.L4
   184  	add	%o0, -4, %o0		C (delay slot) bias rp so the store at +4 in .L4 hits rp[0]
   185  
   186  	.align	16
   187  .L_five_or_more:
   188  	subcc	%o2, 1, %o2
   189  	ld	[%o1], %f11		C read up[i]
   190  	fdtox	%f16, %f14
   191  	ldx	[%o5+16], %g2		C p16
   192  	fdtox	%f4, %f12
   193  	ldx	[%o5+24], %g1		C p0
   194  	std	%f14, [%o5+16]
   195  	fmuld	%f2, %f8, %f16
   196  	std	%f12, [%o5+24]
   197  	fmuld	%f2, %f6, %f4
   198  	add	%o1, 4, %o1		C up++
   199  	lduw	[%o0], %g5		C read rp[i]
   200  	bne,pt	%icc, .Loop		C limbs remain beyond the fifth: enter the pipelined loop
   201  	fxtod	%f10, %f2		C (delay slot, executed on both paths)
   202  	b,a	.L5			C n == 5: skip the loop; annulled, so no delay slot runs
   203  
   204  C BEGIN MAIN LOOP
   205  	.align 16
   206  C -- 0
   207  .Loop:	sub	%g0, %g3, %g3		C cy = -cy (srlx left minus the borrow in cy)
   208  	subcc	%o2, 1, %o2
   209  	ld	[%o1], %f11		C read up[i]
   210  	fdtox	%f16, %f14
   211  C -- 1
   212  	sllx	%g2, 16, %g4		C (p16 << 16)
   213  	add	%o0, 4, %o0		C rp++
   214  	ldx	[%o5+0], %g2		C p16
   215  	fdtox	%f4, %f12
   216  C -- 2
   217  	srl	%g3, 0, %g3		C zero most significant 32 bits
   218  	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
   219  	ldx	[%o5+8], %g1		C p0
   220  	fanop
   221  C -- 3
   222  	nop
   223  	add	%g3, %g4, %g4		C p += cy
   224  	std	%f14, [%o5+0]
   225  	fmuld	%f2, %f8, %f16
   226  C -- 4
   227  	nop
   228  	sub	%g5, %g4, %g4		C p = rp[i] - p
   229  	std	%f12, [%o5+8]
   230  	fmuld	%f2, %f6, %f4
   231  C -- 5
   232  	xor	%o5, 16, %o5		C alternate scratch variables
   233  	add	%o1, 4, %o1		C up++
   234  	stw	%g4, [%o0-4]		C store low 32 bits of the difference (rp already advanced)
   235  	fanop
   236  C -- 6
   237  	srlx	%g4, 32, %g3		C new cy (high word = minus the borrow; negated at loop top)
   238  	lduw	[%o0], %g5		C read rp[i]
   239  	bne,pt	%icc, .Loop
   240  	fxtod	%f10, %f2
   241  C END MAIN LOOP
   242  
   243  .L5:	sub	%g0, %g3, %g3		C cy = -cy; from here on the pipeline is drained without new loads of up
   244  	fdtox	%f16, %f14
   245  	sllx	%g2, 16, %g4		C (p16 << 16)
   246  	ldx	[%o5+0], %g2		C p16
   247  	fdtox	%f4, %f12
   248  	srl	%g3, 0, %g3		C zero most significant 32 bits
   249  	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
   250  	ldx	[%o5+8], %g1		C p0
   251  	add	%g4, %g3, %g4		C p += cy
   252  	std	%f14, [%o5+0]
   253  	fmuld	%f2, %f8, %f16
   254  	sub	%g5, %g4, %g4		C p = rp[i] - p
   255  	std	%f12, [%o5+8]
   256  	fmuld	%f2, %f6, %f4
   257  	xor	%o5, 16, %o5
   258  	stw	%g4, [%o0+0]
   259  	srlx	%g4, 32, %g3		C new cy
   260  	lduw	[%o0+4], %g5		C read rp[i]
   261  
   262  	sub	%g0, %g3, %g3		C cy = -cy (skipped when entering at .L4, where cy is still 0)
   263  .L4:	fdtox	%f16, %f14
   264  	sllx	%g2, 16, %g4		C (p16 << 16)
   265  	ldx	[%o5+0], %g2		C p16
   266  	fdtox	%f4, %f12
   267  	srl	%g3, 0, %g3		C zero most significant 32 bits
   268  	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
   269  	ldx	[%o5+8], %g1		C p0
   270  	add	%g3, %g4, %g4		C p += cy
   271  	std	%f14, [%o5+0]
   272  	sub	%g5, %g4, %g4		C p = rp[i] - p
   273  	std	%f12, [%o5+8]
   274  	xor	%o5, 16, %o5
   275  	stw	%g4, [%o0+4]
   276  	srlx	%g4, 32, %g3		C new cy
   277  	lduw	[%o0+8], %g5		C read rp[i]
   278  
   279  	sub	%g0, %g3, %g3		C cy = -cy
   280  .L3:	sllx	%g2, 16, %g4		C (p16 << 16)
   281  	ldx	[%o5+0], %g2		C p16
   282  	srl	%g3, 0, %g3		C zero most significant 32 bits
   283  	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
   284  	ldx	[%o5+8], %g1		C p0
   285  	add	%g3, %g4, %g4		C p += cy
   286  	sub	%g5, %g4, %g4		C p = rp[i] - p
   287  	xor	%o5, 16, %o5
   288  	stw	%g4, [%o0+8]
   289  	srlx	%g4, 32, %g3		C new cy
   290  	lduw	[%o0+12], %g5		C read rp[i]
   291  
   292  	sub	%g0, %g3, %g3		C cy = -cy
   293  .L2:	sllx	%g2, 16, %g4		C (p16 << 16)
   294  	ldx	[%o5+0], %g2		C p16
   295  	srl	%g3, 0, %g3		C zero most significant 32 bits
   296  	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
   297  	ldx	[%o5+8], %g1		C p0
   298  	add	%g3, %g4, %g4		C p += cy
   299  	sub	%g5, %g4, %g4		C p = rp[i] - p
   300  	stw	%g4, [%o0+12]
   301  	srlx	%g4, 32, %g3		C new cy
   302  	lduw	[%o0+16], %g5		C read rp[i]
   303  
   304  	sub	%g0, %g3, %g3		C cy = -cy
   305  .L1:	sllx	%g2, 16, %g4		C (p16 << 16)
   306  	srl	%g3, 0, %g3		C zero most significant 32 bits
   307  	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
   308  	add	%g3, %g4, %g4		C p += cy
   309  	sub	%g5, %g4, %g4		C p = rp[i] - p
   310  	stw	%g4, [%o0+16]		C store the last result limb
   311  	srlx	%g4, 32, %g3		C new cy
   312  
   313  	sub	%g0, %g3, %o0		C return value = -cy, i.e. the final borrow
   314  	retl
   315  	sub	%sp, -FSIZE, %sp		C (delay slot) release the FSIZE-byte scratch frame
   316  EPILOGUE(mpn_submul_1)