github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/sparc64/ultrasparct3/aormul_2.asm

dnl  SPARC v9 mpn_mul_2 and mpn_addmul_2 for T3/T4/T5.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C		    cycles/limb      cycles/limb
C		       mul_2           addmul_2
C UltraSPARC T3:	22.5		 23.5
C UltraSPARC T4:	 3.25		 3.75


C The code is reasonably scheduled but also relies on out-of-order (OoO)
C execution.  There was hope that this could run at around 3.0 and 3.5 c/l
C respectively on T4; for that, two cycles per iteration would need to be
C removed.
C
C We could almost use 2-way unrolling, but currently the wN registers live
C too long.  By changing add x,w1,w1 to add x,w1,w0, i.e. migrating the
C values downwards, 2-way unrolling should become possible.  With n-indexed
C addressing it should run no slower.
C
C The rp loads into g1/g3 are very much over-scheduled.  Presumably, they
C could be postponed a full way, and then just one register could be used.
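
C What the two entry points compute, as a reference sketch (not GMP
C code): assuming 64-bit limbs and n >= 2, both take the arguments
C (rp, up, n, vp) defined below and return the most significant limb of
C the (n+2)-limb result.  The ref names and unsigned __int128 are
C illustrative only.
C
C	#include <stdint.h>
C	typedef uint64_t limb;
C
C	/* {rp,n} += {up,n} * v; returns the carry limb.  */
C	static limb ref_addmul_1 (limb *rp, const limb *up, long n, limb v)
C	{
C	  limb cy = 0;
C	  for (long i = 0; i < n; i++)
C	    {
C	      unsigned __int128 t = (unsigned __int128) up[i] * v + rp[i] + cy;
C	      rp[i] = (limb) t;
C	      cy = (limb) (t >> 64);
C	    }
C	  return cy;
C	}
C
C	/* mpn_addmul_2: rp[0..n] gets the low n+1 limbs of
C	   {rp,n} + {up,n} * {vp,2}; the top limb is returned.  */
C	static limb ref_addmul_2 (limb *rp, const limb *up, long n,
C	                          const limb *vp)
C	{
C	  rp[n] = ref_addmul_1 (rp, up, n, vp[0]);
C	  return ref_addmul_1 (rp + 1, up, n, vp[1]);
C	}
C
C mpn_mul_2 is identical except that its first pass stores up[i]*v0 into
C rp rather than accumulating into it.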

C INPUT PARAMETERS
define(`rp', `%i0')
define(`up', `%i1')
define(`n',  `%i2')
define(`vp', `%i3')

define(`v0', `%o0')
define(`v1', `%o1')

define(`w0', `%o2')
define(`w1', `%o3')
define(`w2', `%o4')
define(`w3', `%o5')
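
C The wN registers hold the circulating window of partial-sum limbs: in
C each step of the unrolled loop one of them is completed and stored to
C rp, while a freshly zeroed one (addxc of %g0, %g0) starts collecting
C the next carry-out.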

ifdef(`OPERATION_mul_2',`
      define(`AM2',      `')
      define(`ADDX',	 `addcc`'$1')
      define(`func',     `mpn_mul_2')
')
ifdef(`OPERATION_addmul_2',`
      define(`AM2',      `$1')
      define(`ADDX',	 `addxccc($1,$2,$3)')
      define(`func',     `mpn_addmul_2')
')
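
C One source serves both functions: for mpn_addmul_2, AM2 keeps its
C argument, so the rp limbs are loaded and added in, and each ADDX line
C becomes addxccc, which also consumes the carry from that extra add.
C For mpn_mul_2, AM2 expands to nothing and ADDX becomes a plain addcc,
C starting the carry chain there instead.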


MULFUNC_PROLOGUE(mpn_mul_2 mpn_addmul_2)

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)
PROLOGUE(func)
	save	%sp, -176, %sp

	ldx	[vp+0], v0		C load v0
	and	n, 3, %g5
	ldx	[vp+8], v1		C load v1
	add	n, -6, n
	ldx	[up+0], %g4
	brz	%g5, L(b0)
	 cmp	%g5, 2
	bcs	L(b1)
	 nop
	be	L(b2)
	 nop

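C Dispatch on n mod 4 (in %g5) enters the 4-way unrolled loop at the
C matching residue point: brz takes 0 to L(b0); with the cmp in its
C delay slot, bcs (meaning %g5 = 1) takes L(b1), be (%g5 = 2) takes
C L(b2), and %g5 = 3 falls through to L(b3).  n was biased by -6 above
C so the brlz/brgz termination tests come out right.
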
L(b3):
AM2(`	ldx	[rp+0], %g1')
	mulx	%g4, v0, w2
	umulxhi(%g4, v0, w3)
	ldx	[up+8], %i5
	mulx	%g4, v1, %l3
	umulxhi(%g4, v1, %l7)
AM2(`	ldx	[rp+8], %g3')
	add	up, -8, up
	add	rp, -8, rp
	b	L(lo3)
	 mov	0, w0

L(b2):
AM2(`	ldx	[rp+0], %g3')
	mulx	%g4, v0, w3
	umulxhi(%g4, v0, w0)
	ldx	[up+8], %i4
	mulx	%g4, v1, %l1
	umulxhi(%g4, v1, %l5)
AM2(`	ldx	[rp+8], %g1')
	add	rp, 16, rp
	brlz	n, L(end)
	 mov	0, w1
	ba	L(top)
	 add	up, 16, up

L(b1):
AM2(`	ldx	[rp+0], %g1')
	mulx	%g4, v0, w0
	umulxhi(%g4, v0, w1)
	ldx	[up+8], %i5
	mulx	%g4, v1, %l3
	umulxhi(%g4, v1, %l7)
AM2(`	ldx	[rp+8], %g3')
	add	up, 8, up
	add	rp, 8, rp
	b	L(lo1)
	 mov	0, w2

L(b0):
AM2(`	ldx	[rp+0], %g3')
	mulx	%g4, v0, w1
	umulxhi(%g4, v0, w2)
	ldx	[up+8], %i4
	mulx	%g4, v1, %l1
	umulxhi(%g4, v1, %l5)
AM2(`	ldx	[rp+8], %g1')
	b	L(lo0)
	 mov	0, w3

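C In the loop below, the trailing C numbers annotate the schedule of the
C software pipeline within one 4-limb iteration (as written for the
C addmul_2 build, where the AM2 lines are present); a pair such as 0->5
C evidently tracks a value from the slot where it is produced to the
C slot where it is consumed.
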
	ALIGN(16)			C cycle
L(top):	mulx	%i4, v0, %l2		C 0->5
	umulxhi(%i4, v0, %l6)		C 0->5
	ldx	[up+0], %i5		C 1->6
AM2(`	addcc	w3, %g3, w3')		C 1
	stx	w3, [rp-16]		C 2
	ADDX(`	%l1, w0, w0')		C 2
	addxccc(%l5, w1, w1)		C 3
	mulx	%i4, v1, %l3		C 3->9
	umulxhi(%i4, v1, %l7)		C 4->9
AM2(`	ldx	[rp+0], %g3')		C 4
	addcc	%l2, w0, w0		C 5
	addxccc(%l6, w1, w1)		C 5
	addxc(	%g0, %g0, w2)		C 6
L(lo1):	mulx	%i5, v0, %l0		C 6
	umulxhi(%i5, v0, %l4)		C 7
	ldx	[up+8], %i4		C 7
AM2(`	addcc	w0, %g1, w0')		C 8
	stx	w0, [rp-8]		C 8
	ADDX(`	%l3, w1, w1')		C 9
	addxccc(%l7, w2, w2)		C 9
	mulx	%i5, v1, %l1		C 10
	umulxhi(%i5, v1, %l5)		C 10
AM2(`	ldx	[rp+8], %g1')		C 11
	addcc	%l0, w1, w1		C 11
	addxccc(%l4, w2, w2)		C 12
	addxc(	%g0, %g0, w3)		C 12
L(lo0):	mulx	%i4, v0, %l2		C 13
	umulxhi(%i4, v0, %l6)		C 13
	ldx	[up+16], %i5		C 14
AM2(`	addcc	w1, %g3, w1')		C 14
	stx	w1, [rp+0]		C 15
	ADDX(`	%l1, w2, w2')		C 15
	addxccc(%l5, w3, w3)		C 16
	mulx	%i4, v1, %l3		C 16
	umulxhi(%i4, v1, %l7)		C 17
AM2(`	ldx	[rp+16], %g3')		C 17
	addcc	%l2, w2, w2		C 18
	addxccc(%l6, w3, w3)		C 18
	addxc(	%g0, %g0, w0)		C 19
L(lo3):	mulx	%i5, v0, %l0		C 19
	umulxhi(%i5, v0, %l4)		C 20
	ldx	[up+24], %i4		C 20
AM2(`	addcc	w2, %g1, w2')		C 21
	stx	w2, [rp+8]		C 21
	ADDX(`	%l3, w3, w3')		C 22
	addxccc(%l7, w0, w0)		C 22
	mulx	%i5, v1, %l1		C 23
	umulxhi(%i5, v1, %l5)		C 23
AM2(`	ldx	[rp+24], %g1')		C 24
	addcc	%l0, w3, w3		C 24
	addxccc(%l4, w0, w0)		C 25
	addxc(	%g0, %g0, w1)		C 25
	add	up, 32, up
	add	rp, 32, rp
	brgz	n, L(top)
	 add	n, -4, n

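C Wind-down: the final up limb is already in %i4 and the pending partial
C sums in w0-w3; the last three result limbs are stored and the most
C significant limb is returned via %i0, which the restore maps to the
C %o0 of the caller.
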
L(end):	mulx	%i4, v0, %l2
	umulxhi(%i4, v0, %l6)
AM2(`	addcc	w3, %g3, w3')
	stx	w3, [rp-16]
	ADDX(`	%l1, w0, w0')
	addxccc(%l5, w1, w1)
	mulx	%i4, v1, %l3
	umulxhi(%i4, v1, %l7)
	addcc	%l2, w0, w0
	addxccc(%l6, w1, w1)
	addxc(	%g0, %g0, w2)
AM2(`	addcc	w0, %g1, w0')
	stx	w0, [rp-8]
	ADDX(`	%l3, w1, w1')
	stx	w1, [rp+0]
	addxc(%l7, w2, %i0)

	ret
	 restore
EPILOGUE()