github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/sparc64/ultrasparc1234/addmul_2.asm (about)

     1  dnl  SPARC v9 64-bit mpn_addmul_2 -- Multiply an n-limb number by a 2-limb
     2  dnl  number and add the result to an n-limb vector.
     3  
     4  dnl  Copyright 2002, 2003 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  C                  cycles/limb
    35  C UltraSPARC 1&2:      9
    36  C UltraSPARC 3:       10
    37  
    38  C Algorithm: We use 16 floating-point multiplies per limb product, with the
    39  C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand
    40  C split into 32-bit pieces.  We sum four 48-bit partial products using
    41  C floating-point add, then convert the resulting four 50-bit quantities and
    42  C transfer them to the integer unit.
    43  
    44  C Possible optimizations:
    45  C   1. Align the stack area where we transfer the four 50-bit product-sums
    46  C      to a 32-byte boundary.  That would minimize the cache collision.
    47  C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
    48  C      be to align the area to map to the area immediately before up?)
    49  C   2. Perform two of the fp->int conversions with integer instructions.  We
    50  C      can get almost ten free IEU slots, if we clean up bookkeeping and the
    51  C      silly carry-limb code.
    52  C   3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb
    53  C      code.
    54  
    55  C OSP (Overlapping software pipeline) version of mpn_mul_basecase:
    56  C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles.
    57  C FI	= 20
    58  C L	=  9 x un * vn
    59  C WDFI	= 10 x vn / 2
    60  C WD	= 4
    61  
    62  C Instruction classification (as per UltraSPARC functional units).
    63  C Assuming silly carry code is fixed.  Includes bookkeeping.
    64  C
    65  C               mpn_addmul_X     mpn_mul_X
    66  C                1       2       1       2
    67  C               ==========      ==========
    68  C      FM        8      16       8      16
    69  C      FA       10      18      10      18
    70  C     MEM       12      12      10      10
    71  C  ISHIFT        6       6       6       6
    72  C IADDLOG       11      11      10      10
    73  C  BRANCH        1       1       1       1
    74  C
    75  C TOTAL IEU     17      17      16      16
    76  C TOTAL         48      64      45      61
    77  C
    78  C IEU cycles     8.5     8.5     8       8
    79  C MEM cycles    12      12      10      10
    80  C ISSUE cycles  12      16      11.25   15.25
    81  C FPU cycles    10      18      10      18
    82  C cycles/loop   12      18      12      18
    83  C cycles/limb   12       9      12       9
    84  
    85  
    86  C INPUT PARAMETERS
    87  C rp[n + 1]	i0
    88  C up[n]		i1
    89  C n		i2
    90  C vp[2]		i3
    91  
    92  
    93  ASM_START()
        C Declare %g2 and %g3 as scratch registers before the code below uses them.
        C NOTE(review): presumably REGISTER emits the assembler .register directive;
        C confirm in the m4 support files.
    94  	REGISTER(%g2,#scratch)
    95  	REGISTER(%g3,#scratch)
    96  
    97  C Combine registers:
    98  C u00_hi= u32_hi
    99  C u00_lo= u32_lo
   100  C a000  = out000
   101  C a016  = out016
   102  C Free: f52 f54
   103  
   104  
        C Symbolic names for the registers used by mpn_addmul_2.
        C
        C pNNN: destination of an fmuld of one 32-bit piece of u with the v chunk
        C vNNN.  The p096 and p112 products exist in two copies: the a copies are
        C written by the u00 multiplies and the b copies by the u32 multiplies.
   105  define(`p000', `%f8')  define(`p016',`%f10')
   106  define(`p032',`%f12')  define(`p048',`%f14')
   107  define(`p064',`%f16')  define(`p080',`%f18')
   108  define(`p096a',`%f20') define(`p112a',`%f22')
   109  define(`p096b',`%f56') define(`p112b',`%f58')
   110  
        C out000/out016: fdtox results, stored to the stack scratch slots with std.
   111  define(`out000',`%f0') define(`out016',`%f6')
   112  
        C vNNN: the 16-bit chunk of the 2-limb v operand at bit offset NNN, held as
        C a double (v000..v048 come from vp[0], v064..v112 from vp[1]).
   113  define(`v000',`%f24')  define(`v016',`%f26')
   114  define(`v032',`%f28')  define(`v048',`%f30')
   115  define(`v064',`%f44')  define(`v080',`%f46')
   116  define(`v096',`%f48')  define(`v112',`%f50')
   117  
        C u00/u32: the low and the high 32 bits of the current up limb as doubles,
        C produced by fxtod from the register pairs named below.
   118  define(`u00',`%f32')   define(`u32', `%f34')
   119  
        C aNNN: floating-point accumulators holding the sums of partial products.
   120  define(`a000',`%f36')  define(`a016',`%f38')
   121  define(`a032',`%f40')  define(`a048',`%f42')
   122  define(`a064',`%f60')  define(`a080',`%f62')
   123  
        C The _hi halves are zeroed once during initialization; the _lo halves are
        C loaded with 32 bits of up[i], so fxtod on the _hi name converts the pair
        C as one non-negative 64-bit integer.
   124  define(`u00_hi',`%f2') define(`u32_hi',`%f4')
   125  define(`u00_lo',`%f3') define(`u32_lo',`%f5')
   126  
        C Integer side: cy is the running carry, rlimb the running sum for the
        C current 32-bit result word, i00/i16 the product sums reloaded from the
        C stack scratch slots, r00/r32 the low/high 32 bits read from rp[i].
   127  define(`cy',`%g1')
   128  define(`rlimb',`%g3')
   129  define(`i00',`%l0')    define(`i16',`%l1')
   130  define(`r00',`%l2')    define(`r32',`%l3')
        C Masks.  xffff is only set and used by the non-VIS initialization.
   131  define(`xffffffff',`%l7')
   132  define(`xffff',`%o0')
   133  
   134  
   135  PROLOGUE(mpn_addmul_2)
        C Entry (register names as seen after the save below):
        C   %i0 = rp, %i1 = up, %i2 = n (limb count), %i3 = vp (two limbs).
        C The n limbs at up are multiplied by the two limbs at vp and the product
        C is added to the limbs at rp.  Every result limb is written back as two
        C 32-bit stores (stw to offset +4 for the low word, +0 for the high word).
        C The two stores of wind-down phase 3 write one more limb without reading
        C rp first, and the closing return leaves the last carry word in %o0.
        C n must be at least 1: the count is decremented before its first test.
        C
        C Stack scratch: the four 8-byte slots at %sp+2223+0 .. +24 carry fdtox
        C results from the FPU to the integer unit.  Slots +16/+24 are written in
        C the first half of each pipeline step and slots +0/+8 in the second half
        C (the half after each mid marker).  The non-VIS setup also uses +32 .. +56.
        C NOTE(review): 2223 looks like the V9 stack bias 2047 plus the 176-byte
        C register save and argument area; confirm against the ABI.
   136  
   137  C Initialization.  (1) Split v operand into eight 16-bit chunks and store them
   138  C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
   139  C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
   140  C This code could be better scheduled.
   141  
   142  	save	%sp, -256, %sp		C new register window and a 256-byte frame
   143  
   144  ifdef(`HAVE_VIS',
   145  `	mov	-1, %g4
        C VIS setup: every ldda below fetches one chunk of v through the ASI that
        C is selected here, and fxtod then turns it into a double.
        C NOTE(review): 0xD2 is presumably the VIS 16-bit short floating-point
        C load ASI, which would make offset +6 the least significant chunk of
        C vp[0] and +14 that of vp[1]; confirm in the UltraSPARC manual.
   146  	wr	%g0, 0xD2, %asi
   147  	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
   148  	ldda	[%i3+6] %asi, v000
   149  	ldda	[%i3+4] %asi, v016
   150  	ldda	[%i3+2] %asi, v032
   151  	ldda	[%i3+0] %asi, v048
   152  	fxtod	v000, v000
   153  	ldda	[%i3+14] %asi, v064
   154  	fxtod	v016, v016
   155  	ldda	[%i3+12] %asi, v080
   156  	fxtod	v032, v032
   157  	ldda	[%i3+10] %asi, v096
   158  	fxtod	v048, v048
   159  	ldda	[%i3+8] %asi, v112
   160  	fxtod	v064, v064
   161  	fxtod	v080, v080
   162  	fxtod	v096, v096
   163  	fxtod	v112, v112
   164  	fzero	u00_hi
   165  	fzero	u32_hi
   166  ',
   167  `	mov	-1, %g4
        C Non-VIS setup: both v limbs are cut into 16-bit chunks with integer
        C shifts and masks, each chunk is stored as a 64-bit integer in its own
        C stack slot, and the slots are then reloaded into fp registers and
        C converted with fxtod.
   168  	ldx	[%i3+0], %l0		C vp[0]
   169  	srlx	%g4, 48, xffff		C store mask in register `xffff'
   170  	ldx	[%i3+8], %l1		C vp[1]
   171  
   172  	and	%l0, xffff, %g2		C bits 0-15 of vp[0]
   173  	stx	%g2, [%sp+2223+0]
   174  	srlx	%l0, 16, %g3
   175  	and	%g3, xffff, %g3		C bits 16-31 of vp[0]
   176  	stx	%g3, [%sp+2223+8]
   177  	srlx	%l0, 32, %g2
   178  	and	%g2, xffff, %g2		C bits 32-47 of vp[0]
   179  	stx	%g2, [%sp+2223+16]
   180  	srlx	%l0, 48, %g3		C bits 48-63 of vp[0], no mask needed
   181  	stx	%g3, [%sp+2223+24]
   182  	and	%l1, xffff, %g2		C bits 0-15 of vp[1]
   183  	stx	%g2, [%sp+2223+32]
   184  	srlx	%l1, 16, %g3
   185  	and	%g3, xffff, %g3		C bits 16-31 of vp[1]
   186  	stx	%g3, [%sp+2223+40]
   187  	srlx	%l1, 32, %g2
   188  	and	%g2, xffff, %g2		C bits 32-47 of vp[1]
   189  	stx	%g2, [%sp+2223+48]
   190  	srlx	%l1, 48, %g3		C bits 48-63 of vp[1], no mask needed
   191  	stx	%g3, [%sp+2223+56]
   192  
   193  	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
   194  
   195  	ldd	[%sp+2223+0], v000
   196  	ldd	[%sp+2223+8], v016
   197  	ldd	[%sp+2223+16], v032
   198  	ldd	[%sp+2223+24], v048
   199  	fxtod	v000, v000
   200  	ldd	[%sp+2223+32], v064
   201  	fxtod	v016, v016
   202  	ldd	[%sp+2223+40], v080
   203  	fxtod	v032, v032
   204  	ldd	[%sp+2223+48], v096
   205  	fxtod	v048, v048
   206  	ldd	[%sp+2223+56], v112
   207  	fxtod	v064, v064
        C The first word of slot +0 is the upper half of a value below 2^16, so
        C it is zero; loading it clears the _hi registers.
   208  	ld	[%sp+2223+0], u00_hi	C zero u00_hi
   209  	fxtod	v080, v080
   210  	ld	[%sp+2223+0], u32_hi	C zero u32_hi
   211  	fxtod	v096, v096
   212  	fxtod	v112, v112
   213  ')
   214  C Initialization done.
        C Clear the integer carry state (%g2, rlimb, %g4).  rp is biased by -8
        C because the code reads rp[i] through [%i0+4+8] before bumping %i0 and
        C stores through [%i0+4] and [%i0+0] after the bump.
   215  	mov	0, %g2
   216  	mov	0, rlimb
   217  	mov	0, %g4
   218  	add	%i0, -8, %i0		C BOOKKEEPING
   219  
   220  C Start software pipeline.
        C Stage 1: multiply the low 32 bits of up[0] by all eight v chunks.  The
        C first four products go straight into accumulators a000..a048 because
        C nothing has been accumulated yet.
   221  
   222  	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
   223  	fxtod	u00_hi, u00
   224  C mid
   225  	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
   226  	fmuld	u00, v000, a000
   227  	fmuld	u00, v016, a016
   228  	fmuld	u00, v032, a032
   229  	fmuld	u00, v048, a048
   230  	add	%i2, -1, %i2		C BOOKKEEPING
   231  	fmuld	u00, v064, p064
   232  	add	%i1, 8, %i1		C BOOKKEEPING
   233  	fxtod	u32_hi, u32
   234  	fmuld	u00, v080, p080
   235  	fmuld	u00, v096, p096a
   236  	brnz,pt	%i2, .L_2_or_more	C more than one limb left to read
   237  	 fmuld	u00, v112, p112a
   238  
        C n = 1 (fall-through): no further limb is loaded.  Only the products of
        C the high 32 bits of up[0] are formed, then control joins wind-down
        C phase 2.  p064/p080 are copied with fmovd because no p096/p112 product
        C from an earlier piece exists yet to add to them.
   239  .L1:	fdtox	a000, out000
   240  	fmuld	u32, v000, p000
   241  	fdtox	a016, out016
   242  	fmuld	u32, v016, p016
   243  	fmovd	p064, a064
   244  	fmuld	u32, v032, p032
   245  	fmovd	p080, a080
   246  	fmuld	u32, v048, p048
   247  	std	out000, [%sp+2223+16]
   248  	faddd	p000, a032, a000
   249  	fmuld	u32, v064, p064
   250  	std	out016, [%sp+2223+24]
   251  	fxtod	u00_hi, u00
   252  	faddd	p016, a048, a016
   253  	fmuld	u32, v080, p080
   254  	faddd	p032, a064, a032
   255  	fmuld	u32, v096, p096b
   256  	faddd	p048, a080, a048
   257  	fmuld	u32, v112, p112b
   258  C mid
   259  	fdtox	a000, out000
   260  	fdtox	a016, out016
   261  	faddd	p064, p096a, a064
   262  	faddd	p080, p112a, a080
   263  	std	out000, [%sp+2223+0]
   264  	b	.L_wd2
   265  	 std	out016, [%sp+2223+8]	C delay slot, executed before .L_wd2
   266  
        C n >= 2: second fill stage.  Same FP work as a loop iteration, but with
        C no integer accumulation yet because no product sum has been stored
        C and reloaded at this point.
   267  .L_2_or_more:
   268  	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
   269  	fdtox	a000, out000
   270  	fmuld	u32, v000, p000
   271  	fdtox	a016, out016
   272  	fmuld	u32, v016, p016
   273  	fmovd	p064, a064
   274  	fmuld	u32, v032, p032
   275  	fmovd	p080, a080
   276  	fmuld	u32, v048, p048
   277  	std	out000, [%sp+2223+16]
   278  	faddd	p000, a032, a000
   279  	fmuld	u32, v064, p064
   280  	std	out016, [%sp+2223+24]
   281  	fxtod	u00_hi, u00
   282  	faddd	p016, a048, a016
   283  	fmuld	u32, v080, p080
   284  	faddd	p032, a064, a032
   285  	fmuld	u32, v096, p096b
   286  	faddd	p048, a080, a048
   287  	fmuld	u32, v112, p112b
   288  C mid
   289  	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
   290  	fdtox	a000, out000
   291  	fmuld	u00, v000, p000
   292  	fdtox	a016, out016
   293  	fmuld	u00, v016, p016
   294  	faddd	p064, p096a, a064
   295  	fmuld	u00, v032, p032
   296  	faddd	p080, p112a, a080
   297  	fmuld	u00, v048, p048
   298  	add	%i2, -1, %i2		C BOOKKEEPING
   299  	std	out000, [%sp+2223+0]
   300  	faddd	p000, a032, a000
   301  	fmuld	u00, v064, p064
   302  	add	%i1, 8, %i1		C BOOKKEEPING
   303  	std	out016, [%sp+2223+8]
   304  	fxtod	u32_hi, u32
   305  	faddd	p016, a048, a016
   306  	fmuld	u00, v080, p080
   307  	faddd	p032, a064, a032
   308  	fmuld	u00, v096, p096a
   309  	faddd	p048, a080, a048
   310  	brnz,pt	%i2, .L_3_or_more	C n >= 3: enter the main loop
   311  	 fmuld	u00, v112, p112a
   312  
   313  	b	.Lend			C n = 2: straight to the wind-down
   314  	 nop
   315  
   316  C  64      32       0
   317  C   .       .       .
   318  C   .       |__rXXX_|	32
   319  C   .      |___cy___|	34
   320  C   .  |_______i00__|	50
   321  C  |_______i16__|   .	50
   322  
   323  
   324  C BEGIN MAIN LOOP
        C One iteration produces one limb of rp.  First half: multiply the high
        C 32 bits of the current up limb (u32) by the eight v chunks while the low
        C 32 bits of the next limb are loaded, and emit the low word of rp[i].
        C Second half (midloop): the same with u00, emitting the high word.
        C FP side: the accumulators slide down by 32 bits per piece of u, that is
        C a000 = p000 + a032, a016 = p016 + a048, a032 = p032 + a064 and
        C a048 = p048 + a080, while a064/a080 restart from p064/p080 plus the
        C p096/p112 products formed two half-iterations earlier (hence the a and
        C b copies of those two products).
        C Integer side: i00 and i16 are the product sums converted and stored one
        C iteration earlier; the word stored is the low 32 bits of
        C i00 + (rp word) + cy + (i16 << 16), and the carry is recomputed at the
        C start of the following half.
   325  	.align	16
   326  .L_3_or_more:
   327  .Loop:	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
   328  	and	%g2, xffffffff, %g2	C low 32 bits of the previous i16 << 16
   329  	fdtox	a000, out000
   330  	fmuld	u32, v000, p000
   331  C
   332  	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
   333  	add	%g2, rlimb, %l5		C previous word sum, now without truncation
   334  	fdtox	a016, out016
   335  	fmuld	u32, v016, p016
   336  C
   337  	srlx	%l5, 32, cy		C what overflowed the previous 32-bit word
   338  	ldx	[%sp+2223+16], i00
   339  	faddd	p064, p096b, a064
   340  	fmuld	u32, v032, p032
   341  C
   342  	add	%g4, cy, cy		C new cy
   343  	ldx	[%sp+2223+24], i16
   344  	faddd	p080, p112b, a080
   345  	fmuld	u32, v048, p048
   346  C
   347  	nop
   348  	std	out000, [%sp+2223+16]	C refill the slot just read into i00
   349  	faddd	p000, a032, a000
   350  	fmuld	u32, v064, p064
   351  C
   352  	add	i00, r00, rlimb		C product sum plus the rp word
   353  	add	%i0, 8, %i0		C BOOKKEEPING
   354  	std	out016, [%sp+2223+24]	C refill the slot just read into i16
   355  	fxtod	u00_hi, u00
   356  C
   357  	sllx	i16, 16, %g2		C i16 has weight 2^16 relative to i00
   358  	add	cy, rlimb, rlimb	C plus the incoming carry
   359  	faddd	p016, a048, a016
   360  	fmuld	u32, v080, p080
   361  C
   362  	srlx	i16, 16, %g4		C part of i16 << 16 above bit 31
   363  	add	%g2, rlimb, %l5
   364  	faddd	p032, a064, a032
   365  	fmuld	u32, v096, p096b
   366  C
   367  	stw	%l5, [%i0+4]		C write low 32 bits of rp[i]
   368  	nop
   369  	faddd	p048, a080, a048
   370  	fmuld	u32, v112, p112b
   371  C midloop
        C Second half: same integer steps as above for the high word of rp[i],
        C using scratch slots +0/+8 and the u00 products.
   372  	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
   373  	and	%g2, xffffffff, %g2
   374  	fdtox	a000, out000
   375  	fmuld	u00, v000, p000
   376  C
   377  	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
   378  	add	%g2, rlimb, %l5
   379  	fdtox	a016, out016
   380  	fmuld	u00, v016, p016
   381  C
   382  	srlx	%l5, 32, cy
   383  	ldx	[%sp+2223+0], i00
   384  	faddd	p064, p096a, a064
   385  	fmuld	u00, v032, p032
   386  C
   387  	add	%g4, cy, cy		C new cy
   388  	ldx	[%sp+2223+8], i16
   389  	faddd	p080, p112a, a080
   390  	fmuld	u00, v048, p048
   391  C
   392  	add	%i2, -1, %i2		C BOOKKEEPING
   393  	std	out000, [%sp+2223+0]
   394  	faddd	p000, a032, a000
   395  	fmuld	u00, v064, p064
   396  C
   397  	add	i00, r32, rlimb
   398  	add	%i1, 8, %i1		C BOOKKEEPING
   399  	std	out016, [%sp+2223+8]
   400  	fxtod	u32_hi, u32
   401  C
   402  	sllx	i16, 16, %g2
   403  	add	cy, rlimb, rlimb
   404  	faddd	p016, a048, a016
   405  	fmuld	u00, v080, p080
   406  C
   407  	srlx	i16, 16, %g4
   408  	add	%g2, rlimb, %l5
   409  	faddd	p032, a064, a032
   410  	fmuld	u00, v096, p096a
   411  C
   412  	stw	%l5, [%i0+0]		C write high 32 bits of rp[i]
   413  	faddd	p048, a080, a048
   414  	brnz,pt	%i2, .Loop		C loop while limbs of up remain
   415  	 fmuld	u00, v112, p112a
   416  C END MAIN LOOP
   417  
   418  C WIND-DOWN PHASE 1
        C All of up has been loaded.  The first half still forms the products of
        C the last high 32-bit piece (u32); the second half has no multiplies
        C left and only converts, stores and accumulates.
   419  .Lend:	and	%g2, xffffffff, %g2
   420  	fdtox	a000, out000
   421  	fmuld	u32, v000, p000
   422  	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
   423  	add	%g2, rlimb, %l5
   424  	fdtox	a016, out016
   425  	fmuld	u32, v016, p016
   426  	srlx	%l5, 32, cy
   427  	ldx	[%sp+2223+16], i00
   428  	faddd	p064, p096b, a064
   429  	fmuld	u32, v032, p032
   430  	add	%g4, cy, cy		C new cy
   431  	ldx	[%sp+2223+24], i16
   432  	faddd	p080, p112b, a080
   433  	fmuld	u32, v048, p048
   434  	std	out000, [%sp+2223+16]
   435  	faddd	p000, a032, a000
   436  	fmuld	u32, v064, p064
   437  	add	i00, r00, rlimb
   438  	add	%i0, 8, %i0		C BOOKKEEPING
   439  	std	out016, [%sp+2223+24]
   440  	sllx	i16, 16, %g2
   441  	add	cy, rlimb, rlimb
   442  	faddd	p016, a048, a016
   443  	fmuld	u32, v080, p080
   444  	srlx	i16, 16, %g4
   445  	add	%g2, rlimb, %l5
   446  	faddd	p032, a064, a032
   447  	fmuld	u32, v096, p096b
   448  	stw	%l5, [%i0+4]
   449  	faddd	p048, a080, a048
   450  	fmuld	u32, v112, p112b
   451  C mid
   452  	and	%g2, xffffffff, %g2
   453  	fdtox	a000, out000
   454  	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
   455  	add	%g2, rlimb, %l5
   456  	fdtox	a016, out016
   457  	srlx	%l5, 32, cy
   458  	ldx	[%sp+2223+0], i00
   459  	faddd	p064, p096a, a064
   460  	add	%g4, cy, cy		C new cy
   461  	ldx	[%sp+2223+8], i16
   462  	faddd	p080, p112a, a080
   463  	std	out000, [%sp+2223+0]
   464  	add	i00, r32, rlimb
   465  	std	out016, [%sp+2223+8]
   466  	sllx	i16, 16, %g2
   467  	add	cy, rlimb, rlimb
   468  	srlx	i16, 16, %g4
   469  	add	%g2, rlimb, %l5
   470  	stw	%l5, [%i0+0]
   471  
   472  C WIND-DOWN PHASE 2
        C No floating-point arithmetic remains, only conversions: a032/a048 are
        C converted in the first half and a064/a080 in the second, while the
        C integer side keeps adding the reloaded sums to the words of rp.
        C This is also the join point for the n = 1 path.
   473  .L_wd2:	and	%g2, xffffffff, %g2
   474  	fdtox	a032, out000
   475  	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
   476  	add	%g2, rlimb, %l5
   477  	fdtox	a048, out016
   478  	srlx	%l5, 32, cy
   479  	ldx	[%sp+2223+16], i00
   480  	add	%g4, cy, cy		C new cy
   481  	ldx	[%sp+2223+24], i16
   482  	std	out000, [%sp+2223+16]
   483  	add	i00, r00, rlimb
   484  	add	%i0, 8, %i0		C BOOKKEEPING
   485  	std	out016, [%sp+2223+24]
   486  	sllx	i16, 16, %g2
   487  	add	cy, rlimb, rlimb
   488  	srlx	i16, 16, %g4
   489  	add	%g2, rlimb, %l5
   490  	stw	%l5, [%i0+4]
   491  C mid
   492  	and	%g2, xffffffff, %g2
   493  	fdtox	a064, out000
   494  	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
   495  	add	%g2, rlimb, %l5
   496  	fdtox	a080, out016
   497  	srlx	%l5, 32, cy
   498  	ldx	[%sp+2223+0], i00
   499  	add	%g4, cy, cy		C new cy
   500  	ldx	[%sp+2223+8], i16
   501  	std	out000, [%sp+2223+0]
   502  	add	i00, r32, rlimb
   503  	std	out016, [%sp+2223+8]
   504  	sllx	i16, 16, %g2
   505  	add	cy, rlimb, rlimb
   506  	srlx	i16, 16, %g4
   507  	add	%g2, rlimb, %l5
   508  	stw	%l5, [%i0+0]
   509  
   510  C WIND-DOWN PHASE 3
        C The last two products (p096b/p112b) are converted.  From here on no rp
        C word is read: rlimb is loaded directly from the scratch slot, so the
        C two stw below write one limb made only of product sums and carry.  The
        C final block folds the remaining sums into the return value.
   511  .L_wd3:	and	%g2, xffffffff, %g2
   512  	fdtox	p096b, out000
   513  	add	%g2, rlimb, %l5
   514  	fdtox	p112b, out016
   515  	srlx	%l5, 32, cy
   516  	ldx	[%sp+2223+16], rlimb
   517  	add	%g4, cy, cy		C new cy
   518  	ldx	[%sp+2223+24], i16
   519  	std	out000, [%sp+2223+16]
   520  	add	%i0, 8, %i0		C BOOKKEEPING
   521  	std	out016, [%sp+2223+24]
   522  	sllx	i16, 16, %g2
   523  	add	cy, rlimb, rlimb
   524  	srlx	i16, 16, %g4
   525  	add	%g2, rlimb, %l5
   526  	stw	%l5, [%i0+4]
   527  C mid
   528  	and	%g2, xffffffff, %g2
   529  	add	%g2, rlimb, %l5
   530  	srlx	%l5, 32, cy
   531  	ldx	[%sp+2223+0], rlimb
   532  	add	%g4, cy, cy		C new cy
   533  	ldx	[%sp+2223+8], i16
   534  	sllx	i16, 16, %g2
   535  	add	cy, rlimb, rlimb
   536  	srlx	i16, 16, %g4
   537  	add	%g2, rlimb, %l5
   538  	stw	%l5, [%i0+0]
   539  
   540  	and	%g2, xffffffff, %g2
   541  	add	%g2, rlimb, %l5
   542  	srlx	%l5, 32, cy
   543  	ldx	[%sp+2223+16], i00
   544  	add	%g4, cy, cy		C new cy
   545  	ldx	[%sp+2223+24], i16
   546  
        C Return value = (i16 << 16) + i00 + cy.  The last add sits in the delay
        C slot of return and reads only global registers (%g2 and cy).
        C NOTE(review): return presumably restores the register window before
        C the delay slot executes, so %o0 there belongs to the caller; confirm.
   547  	sllx	i16, 16, %g2
   548  	add	i00, cy, cy
   549  	return	%i7+8
   550  	add	%g2, cy, %o0
   551  EPILOGUE(mpn_addmul_2)