github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/sparc32/v9/addmul_1.asm (about)

     1  dnl  SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
     2  dnl  the result to a second limb vector.
     3  
     4  dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  C Algorithm: We use two floating-point multiplies per limb product, with the
    35  C invariant v operand split into two 16-bit pieces, and the u operand split
    36  C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
    37  C the integer unit.
    38  
    39  C		   cycles/limb
    40  C UltraSPARC 1&2:     6.5
    41  C UltraSPARC 3:	      ?
    42  
    43  C Possible optimizations:
    44  C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
    45  C      memory bandwidth limited, this could save 1.5 cycles/limb.
    46  C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
    47  C      it is very straightforward to unroll, using an exit branch midway.
    48  C      Unrolling would allow deeper scheduling which could improve speed for L2
    49  C      cache case.
    50  C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
    51  C      aren't sufficiently apart-scheduled with just two temp areas.
    52  C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
    53  C      could save many operations.
    54  
    55  C INPUT PARAMETERS
    56  C rp	i0
    57  C up	i1
    58  C n	i2
    59  C v	i3
    60  
    61  define(`FSIZE',224)
    62  
    63  ASM_START()
    64  PROLOGUE(mpn_addmul_1)
    65  	add	%sp, -FSIZE, %sp
    66  	sethi	%hi(0xffff), %g1
    67  	srl	%o3, 16, %g2
    68  	or	%g1, %lo(0xffff), %g1
    69  	and	%o3, %g1, %g1
    70  	stx	%g1, [%sp+104]
    71  	stx	%g2, [%sp+112]
    72  	ldd	[%sp+104], %f6
    73  	ldd	[%sp+112], %f8
    74  	fxtod	%f6, %f6
    75  	fxtod	%f8, %f8
    76  	ld	[%sp+104], %f10		C zero f10
    77  
    78  	mov	0, %g3			C cy = 0
    79  
    80  define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe
    81  
    82  	add	%sp, 160, %o5		C point in scratch area
    83  	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
    84  
    85  	subcc	%o2, 1, %o2
    86  	ld	[%o1], %f11		C read up[i]
    87  	add	%o1, 4, %o1		C up++
    88  	bne,pt	%icc, .L_two_or_more
    89  	fxtod	%f10, %f2
    90  
    91  	fmuld	%f2, %f8, %f16
    92  	fmuld	%f2, %f6, %f4
    93  	fdtox	%f16, %f14
    94  	fdtox	%f4, %f12
    95  	std	%f14, [%o5+16]
    96  	std	%f12, [%o5+24]
    97  	ldx	[%o5+16], %g2		C p16
    98  	ldx	[%o5+24], %g1		C p0
    99  	lduw	[%o0], %g5		C read rp[i]
   100  	b	.L1
   101  	add	%o0, -16, %o0
   102  
   103  	.align	16
   104  .L_two_or_more:
   105  	subcc	%o2, 1, %o2
   106  	ld	[%o1], %f11		C read up[i]
   107  	fmuld	%f2, %f8, %f16
   108  	fmuld	%f2, %f6, %f4
   109  	add	%o1, 4, %o1		C up++
   110  	bne,pt	%icc, .L_three_or_more
   111  	fxtod	%f10, %f2
   112  
   113  	fdtox	%f16, %f14
   114  	fdtox	%f4, %f12
   115  	std	%f14, [%o5+16]
   116  	fmuld	%f2, %f8, %f16
   117  	std	%f12, [%o5+24]
   118  	fmuld	%f2, %f6, %f4
   119  	fdtox	%f16, %f14
   120  	fdtox	%f4, %f12
   121  	std	%f14, [%o5+0]
   122  	std	%f12, [%o5+8]
   123  	lduw	[%o0], %g5		C read rp[i]
   124  	ldx	[%o5+16], %g2		C p16
   125  	ldx	[%o5+24], %g1		C p0
   126  	b	.L2
   127  	add	%o0, -12, %o0
   128  
   129  	.align	16
   130  .L_three_or_more:
   131  	subcc	%o2, 1, %o2
   132  	ld	[%o1], %f11		C read up[i]
   133  	fdtox	%f16, %f14
   134  	fdtox	%f4, %f12
   135  	std	%f14, [%o5+16]
   136  	fmuld	%f2, %f8, %f16
   137  	std	%f12, [%o5+24]
   138  	fmuld	%f2, %f6, %f4
   139  	add	%o1, 4, %o1		C up++
   140  	bne,pt	%icc, .L_four_or_more
   141  	fxtod	%f10, %f2
   142  
   143  	fdtox	%f16, %f14
   144  	fdtox	%f4, %f12
   145  	std	%f14, [%o5+0]
   146  	fmuld	%f2, %f8, %f16
   147  	std	%f12, [%o5+8]
   148  	fmuld	%f2, %f6, %f4
   149  	fdtox	%f16, %f14
   150  	ldx	[%o5+16], %g2		C p16
   151  	fdtox	%f4, %f12
   152  	ldx	[%o5+24], %g1		C p0
   153  	std	%f14, [%o5+16]
   154  	std	%f12, [%o5+24]
   155  	lduw	[%o0], %g5		C read rp[i]
   156  	b	.L3
   157  	add	%o0, -8, %o0
   158  
   159  	.align	16
   160  .L_four_or_more:
   161  	subcc	%o2, 1, %o2
   162  	ld	[%o1], %f11		C read up[i]
   163  	fdtox	%f16, %f14
   164  	fdtox	%f4, %f12
   165  	std	%f14, [%o5+0]
   166  	fmuld	%f2, %f8, %f16
   167  	std	%f12, [%o5+8]
   168  	fmuld	%f2, %f6, %f4
   169  	add	%o1, 4, %o1		C up++
   170  	bne,pt	%icc, .L_five_or_more
   171  	fxtod	%f10, %f2
   172  
   173  	fdtox	%f16, %f14
   174  	ldx	[%o5+16], %g2		C p16
   175  	fdtox	%f4, %f12
   176  	ldx	[%o5+24], %g1		C p0
   177  	std	%f14, [%o5+16]
   178  	fmuld	%f2, %f8, %f16
   179  	std	%f12, [%o5+24]
   180  	fmuld	%f2, %f6, %f4
   181  	add	%o1, 4, %o1		C up++
   182  	lduw	[%o0], %g5		C read rp[i]
   183  	b	.L4
   184  	add	%o0, -4, %o0
   185  
   186  	.align	16
   187  .L_five_or_more:
   188  	subcc	%o2, 1, %o2
   189  	ld	[%o1], %f11		C read up[i]
   190  	fdtox	%f16, %f14
   191  	ldx	[%o5+16], %g2		C p16
   192  	fdtox	%f4, %f12
   193  	ldx	[%o5+24], %g1		C p0
   194  	std	%f14, [%o5+16]
   195  	fmuld	%f2, %f8, %f16
   196  	std	%f12, [%o5+24]
   197  	fmuld	%f2, %f6, %f4
   198  	add	%o1, 4, %o1		C up++
   199  	lduw	[%o0], %g5		C read rp[i]
   200  	bne,pt	%icc, .Loop
   201  	fxtod	%f10, %f2
   202  	b,a	.L5
   203  
   204  C BEGIN MAIN LOOP
   205  	.align 16
   206  C -- 0
   207  .Loop:	nop
   208  	subcc	%o2, 1, %o2
   209  	ld	[%o1], %f11		C read up[i]
   210  	fdtox	%f16, %f14
   211  C -- 1
   212  	sllx	%g2, 16, %g4		C (p16 << 16)
   213  	add	%o0, 4, %o0		C rp++
   214  	ldx	[%o5+0], %g2		C p16
   215  	fdtox	%f4, %f12
   216  C -- 2
   217  	nop
   218  	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
   219  	ldx	[%o5+8], %g1		C p0
   220  	fanop
   221  C -- 3
   222  	nop
   223  	add	%g3, %g4, %g4		C p += cy
   224  	std	%f14, [%o5+0]
   225  	fmuld	%f2, %f8, %f16
   226  C -- 4
   227  	nop
   228  	add	%g5, %g4, %g4		C p += rp[i]
   229  	std	%f12, [%o5+8]
   230  	fmuld	%f2, %f6, %f4
   231  C -- 5
   232  	xor	%o5, 16, %o5		C alternate scratch variables
   233  	add	%o1, 4, %o1		C up++
   234  	stw	%g4, [%o0-4]
   235  	fanop
   236  C -- 6
   237  	srlx	%g4, 32, %g3		C new cy
   238  	lduw	[%o0], %g5		C read rp[i]
   239  	bne,pt	%icc, .Loop
   240  	fxtod	%f10, %f2
   241  C END MAIN LOOP
   242  
   243  .L5:	fdtox	%f16, %f14
   244  	sllx	%g2, 16, %g4		C (p16 << 16)
   245  	ldx	[%o5+0], %g2		C p16
   246  	fdtox	%f4, %f12
   247  	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
   248  	ldx	[%o5+8], %g1		C p0
   249  	add	%g4, %g3, %g4		C p += cy
   250  	std	%f14, [%o5+0]
   251  	fmuld	%f2, %f8, %f16
   252  	add	%g5, %g4, %g4		C p += rp[i]
   253  	std	%f12, [%o5+8]
   254  	fmuld	%f2, %f6, %f4
   255  	xor	%o5, 16, %o5
   256  	stw	%g4, [%o0+0]
   257  	srlx	%g4, 32, %g3		C new cy
   258  	lduw	[%o0+4], %g5		C read rp[i]
   259  
   260  .L4:	fdtox	%f16, %f14
   261  	sllx	%g2, 16, %g4		C (p16 << 16)
   262  	ldx	[%o5+0], %g2		C p16
   263  	fdtox	%f4, %f12
   264  	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
   265  	ldx	[%o5+8], %g1		C p0
   266  	add	%g3, %g4, %g4		C p += cy
   267  	std	%f14, [%o5+0]
   268  	add	%g5, %g4, %g4		C p += rp[i]
   269  	std	%f12, [%o5+8]
   270  	xor	%o5, 16, %o5
   271  	stw	%g4, [%o0+4]
   272  	srlx	%g4, 32, %g3		C new cy
   273  	lduw	[%o0+8], %g5		C read rp[i]
   274  
   275  .L3:	sllx	%g2, 16, %g4		C (p16 << 16)
   276  	ldx	[%o5+0], %g2		C p16
   277  	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
   278  	ldx	[%o5+8], %g1		C p0
   279  	add	%g3, %g4, %g4		C p += cy
   280  	add	%g5, %g4, %g4		C p += rp[i]
   281  	xor	%o5, 16, %o5
   282  	stw	%g4, [%o0+8]
   283  	srlx	%g4, 32, %g3		C new cy
   284  	lduw	[%o0+12], %g5		C read rp[i]
   285  
   286  .L2:	sllx	%g2, 16, %g4		C (p16 << 16)
   287  	ldx	[%o5+0], %g2		C p16
   288  	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
   289  	ldx	[%o5+8], %g1		C p0
   290  	add	%g3, %g4, %g4		C p += cy
   291  	add	%g5, %g4, %g4		C p += rp[i]
   292  	stw	%g4, [%o0+12]
   293  	srlx	%g4, 32, %g3		C new cy
   294  	lduw	[%o0+16], %g5		C read rp[i]
   295  
   296  .L1:	sllx	%g2, 16, %g4		C (p16 << 16)
   297  	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
   298  	add	%g3, %g4, %g4		C p += cy
   299  	add	%g5, %g4, %g4		C p += rp[i]
   300  	stw	%g4, [%o0+16]
   301  	srlx	%g4, 32, %g3		C new cy
   302  
   303  	mov	%g3, %o0
   304  	retl
   305  	sub	%sp, -FSIZE, %sp
   306  EPILOGUE(mpn_addmul_1)