github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/sparc32/v9/mul_1.asm (about)

     1  dnl  SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
     2  dnl  the result in a second limb vector.
     3  
     4  dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  C Algorithm: We use two floating-point multiplies per limb product, with the
    35  C invariant v operand split into two 16-bit pieces, and the u operand split
    36  C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
    37  C the integer unit.
    38  
    39  C		   cycles/limb
    40  C UltraSPARC 1&2:     6.5
    41  C UltraSPARC 3:	      ?
    42  
    43  C Possible optimizations:
    44  C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
    45  C      memory bandwidth limited, this could save 1.5 cycles/limb.
    46  C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
    47  C      it is very straightforward to unroll, using an exit branch midways.
    48  C      Unrolling would allow deeper scheduling which could improve speed for L2
    49  C      cache case.
    50  C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
    51  C      aren't sufficiently apart-scheduled with just two temp areas.
    52  C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
    53  C      could save many operations.
    54  
    55  C INPUT PARAMETERS
    56  C rp	i0
    57  C up	i1
    58  C n	i2
    59  C v	i3
    60  
    61  define(`FSIZE',224)
    62  
    63  ASM_START()
    64  PROLOGUE(mpn_mul_1)
    65  	add	%sp, -FSIZE, %sp
    66  	sethi	%hi(0xffff), %g1
    67  	srl	%o3, 16, %g2
    68  	or	%g1, %lo(0xffff), %g1
    69  	and	%o3, %g1, %g1
    70  	stx	%g1, [%sp+104]
    71  	stx	%g2, [%sp+112]
    72  	ldd	[%sp+104], %f6
    73  	ldd	[%sp+112], %f8
    74  	fxtod	%f6, %f6
    75  	fxtod	%f8, %f8
    76  	ld	[%sp+104], %f10		C zero f10
    77  
    78  	mov	0, %g3			C cy = 0
    79  
    80  define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe
    81  
    82  	add	%sp, 160, %o5		C point in scratch area
    83  	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
    84  
    85  	subcc	%o2, 1, %o2
    86  	ld	[%o1], %f11		C read up[i]
    87  	add	%o1, 4, %o1		C up++
    88  	bne,pt	%icc, .L_two_or_more
    89  	fxtod	%f10, %f2
    90  
    91  	fmuld	%f2, %f8, %f16
    92  	fmuld	%f2, %f6, %f4
    93  	fdtox	%f16, %f14
    94  	fdtox	%f4, %f12
    95  	std	%f14, [%o5+16]
    96  	std	%f12, [%o5+24]
    97  	ldx	[%o5+16], %g2		C p16
    98  	ldx	[%o5+24], %g1		C p0
    99  	b	.L1
   100  	add	%o0, -16, %o0
   101  
   102  	.align	16
   103  .L_two_or_more:
   104  	subcc	%o2, 1, %o2
   105  	ld	[%o1], %f11		C read up[i]
   106  	fmuld	%f2, %f8, %f16
   107  	fmuld	%f2, %f6, %f4
   108  	add	%o1, 4, %o1		C up++
   109  	bne,pt	%icc, .L_three_or_more
   110  	fxtod	%f10, %f2
   111  
   112  	fdtox	%f16, %f14
   113  	fdtox	%f4, %f12
   114  	std	%f14, [%o5+16]
   115  	fmuld	%f2, %f8, %f16
   116  	std	%f12, [%o5+24]
   117  	fmuld	%f2, %f6, %f4
   118  	fdtox	%f16, %f14
   119  	fdtox	%f4, %f12
   120  	std	%f14, [%o5+0]
   121  	std	%f12, [%o5+8]
   122  	ldx	[%o5+16], %g2		C p16
   123  	ldx	[%o5+24], %g1		C p0
   124  	b	.L2
   125  	add	%o0, -12, %o0
   126  
   127  	.align	16
   128  .L_three_or_more:
   129  	subcc	%o2, 1, %o2
   130  	ld	[%o1], %f11		C read up[i]
   131  	fdtox	%f16, %f14
   132  	fdtox	%f4, %f12
   133  	std	%f14, [%o5+16]
   134  	fmuld	%f2, %f8, %f16
   135  	std	%f12, [%o5+24]
   136  	fmuld	%f2, %f6, %f4
   137  	add	%o1, 4, %o1		C up++
   138  	bne,pt	%icc, .L_four_or_more
   139  	fxtod	%f10, %f2
   140  
   141  	fdtox	%f16, %f14
   142  	fdtox	%f4, %f12
   143  	std	%f14, [%o5+0]
   144  	fmuld	%f2, %f8, %f16
   145  	std	%f12, [%o5+8]
   146  	fmuld	%f2, %f6, %f4
   147  	fdtox	%f16, %f14
   148  	ldx	[%o5+16], %g2		C p16
   149  	fdtox	%f4, %f12
   150  	ldx	[%o5+24], %g1		C p0
   151  	std	%f14, [%o5+16]
   152  	std	%f12, [%o5+24]
   153  	b	.L3
   154  	add	%o0, -8, %o0
   155  
   156  	.align	16
   157  .L_four_or_more:
   158  	subcc	%o2, 1, %o2
   159  	ld	[%o1], %f11		C read up[i]
   160  	fdtox	%f16, %f14
   161  	fdtox	%f4, %f12
   162  	std	%f14, [%o5+0]
   163  	fmuld	%f2, %f8, %f16
   164  	std	%f12, [%o5+8]
   165  	fmuld	%f2, %f6, %f4
   166  	add	%o1, 4, %o1		C up++
   167  	bne,pt	%icc, .L_five_or_more
   168  	fxtod	%f10, %f2
   169  
   170  	fdtox	%f16, %f14
   171  	ldx	[%o5+16], %g2		C p16
   172  	fdtox	%f4, %f12
   173  	ldx	[%o5+24], %g1		C p0
   174  	std	%f14, [%o5+16]
   175  	fmuld	%f2, %f8, %f16
   176  	std	%f12, [%o5+24]
   177  	fmuld	%f2, %f6, %f4
   178  	add	%o1, 4, %o1		C up++
   179  	b	.L4
   180  	add	%o0, -4, %o0
   181  
   182  	.align	16
   183  .L_five_or_more:
   184  	subcc	%o2, 1, %o2
   185  	ld	[%o1], %f11		C read up[i]
   186  	fdtox	%f16, %f14
   187  	ldx	[%o5+16], %g2		C p16
   188  	fdtox	%f4, %f12
   189  	ldx	[%o5+24], %g1		C p0
   190  	std	%f14, [%o5+16]
   191  	fmuld	%f2, %f8, %f16
   192  	std	%f12, [%o5+24]
   193  	fmuld	%f2, %f6, %f4
   194  	add	%o1, 4, %o1		C up++
   195  	bne,pt	%icc, .Loop
   196  	fxtod	%f10, %f2
   197  	b,a	.L5
   198  
   199  C BEGIN MAIN LOOP
   200  	.align 16
   201  C -- 0
   202  .Loop:	nop
   203  	subcc	%o2, 1, %o2
   204  	ld	[%o1], %f11		C read up[i]
   205  	fdtox	%f16, %f14
   206  C -- 1
   207  	sllx	%g2, 16, %g4		C (p16 << 16)
   208  	add	%o0, 4, %o0		C rp++
   209  	ldx	[%o5+0], %g2		C p16
   210  	fdtox	%f4, %f12
   211  C -- 2
   212  	nop
   213  	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
   214  	ldx	[%o5+8], %g1		C p0
   215  	fanop
   216  C -- 3
   217  	nop
   218  	add	%g3, %g4, %g4		C p += cy
   219  	std	%f14, [%o5+0]
   220  	fmuld	%f2, %f8, %f16
   221  C -- 4
   222  	srlx	%g4, 32, %g3		C new cy
   223  	add	%o1, 4, %o1		C up++
   224  	std	%f12, [%o5+8]
   225  	fmuld	%f2, %f6, %f4
   226  C -- 5
   227  	xor	%o5, 16, %o5		C alternate scratch variables
   228  	stw	%g4, [%o0-4]
   229  	bne,pt	%icc, .Loop
   230  	fxtod	%f10, %f2
   231  C END MAIN LOOP
   232  
   233  .L5:	fdtox	%f16, %f14
   234  	sllx	%g2, 16, %g4		C (p16 << 16)
   235  	ldx	[%o5+0], %g2		C p16
   236  	fdtox	%f4, %f12
   237  	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
   238  	ldx	[%o5+8], %g1		C p0
   239  	add	%g4, %g3, %g4		C p += cy
   240  	std	%f14, [%o5+0]
   241  	fmuld	%f2, %f8, %f16
   242  	std	%f12, [%o5+8]
   243  	fmuld	%f2, %f6, %f4
   244  	xor	%o5, 16, %o5
   245  	stw	%g4, [%o0+0]
   246  	srlx	%g4, 32, %g3		C new cy
   247  
   248  .L4:	fdtox	%f16, %f14
   249  	sllx	%g2, 16, %g4		C (p16 << 16)
   250  	ldx	[%o5+0], %g2		C p16
   251  	fdtox	%f4, %f12
   252  	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
   253  	ldx	[%o5+8], %g1		C p0
   254  	add	%g3, %g4, %g4		C p += cy
   255  	std	%f14, [%o5+0]
   256  	std	%f12, [%o5+8]
   257  	xor	%o5, 16, %o5
   258  	stw	%g4, [%o0+4]
   259  	srlx	%g4, 32, %g3		C new cy
   260  
   261  .L3:	sllx	%g2, 16, %g4		C (p16 << 16)
   262  	ldx	[%o5+0], %g2		C p16
   263  	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
   264  	ldx	[%o5+8], %g1		C p0
   265  	add	%g3, %g4, %g4		C p += cy
   266  	xor	%o5, 16, %o5
   267  	stw	%g4, [%o0+8]
   268  	srlx	%g4, 32, %g3		C new cy
   269  
   270  .L2:	sllx	%g2, 16, %g4		C (p16 << 16)
   271  	ldx	[%o5+0], %g2		C p16
   272  	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
   273  	ldx	[%o5+8], %g1		C p0
   274  	add	%g3, %g4, %g4		C p += cy
   275  	stw	%g4, [%o0+12]
   276  	srlx	%g4, 32, %g3		C new cy
   277  
   278  .L1:	sllx	%g2, 16, %g4		C (p16 << 16)
   279  	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
   280  	add	%g3, %g4, %g4		C p += cy
   281  	stw	%g4, [%o0+16]
   282  	srlx	%g4, 32, %g3		C new cy
   283  
   284  	mov	%g3, %o0
   285  	retl
   286  	sub	%sp, -FSIZE, %sp
   287  EPILOGUE(mpn_mul_1)