github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/sparc64/ultrasparct3/aormul_4.asm (about)

     1  dnl  SPARC v9 mpn_mul_4 and mpn_addmul_4 for T3/T4/T5.
     2  
     3  dnl  Contributed to the GNU project by Torbjörn Granlund.
     4  
     5  dnl  Copyright 2013 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  
    36  C		    cycles/limb      cycles/limb
    37  C		       mul_4           addmul_4
    38  C UltraSPARC T3:	21.5		22.0
    39  C UltraSPARC T4:	 2.625		 2.75
    40  
    41  
    42  C The code is well-scheduled and relies on OoO very little.  There is hope that
    43  C mul_4 and addmul_4 will run at around 2.5 and 2.75 c/l, respectively, on T4.
    44  
    45  define(`rp', `%i0')		C result pointer
    46  define(`up', `%i1')		C source pointer
    47  define(`n',  `%i2')		C limb count on entry; the function turns it into a byte index
    48  define(`vp', `%i3')		C pointer to the four multiplier limbs
    49  
        C The four multiplier limbs, loaded once from vp.  v3 shares %i3 with vp, so
        C vp is gone as soon as v3 has been loaded.
    50  define(`v0', `%g1')
    51  define(`v1', `%o7')
    52  define(`v2', `%g2')
    53  define(`v3', `%i3')
    54  
        C Running sum window.  w0 is the limb that gets stored next; w4 receives the
        C carry out of the w0..w3 additions.
    55  define(`w0', `%o0')
    56  define(`w1', `%o1')
    57  define(`w2', `%o2')
    58  define(`w3', `%o3')
    59  define(`w4', `%o4')
    60  
        C Only referenced inside AM4(...): the limb loaded from rp that is added to w0.
    61  define(`r0', `%o5')
    62  
        C Two limbs of up; the code alternates between them.
    63  define(`u0', `%i4')
    64  define(`u1', `%i5')
    65  
        C Base registers for [base + n] addressing.  rp0 and up0 are simply rp and up;
        C the function sets rp1 = rp0 + 8, rp2 = rp0 + 16 and up1 = up0 + 8.
    66  define(`rp0', `rp')
    67  define(`rp1', `%g3')
    68  define(`rp2', `%g4')
    69  define(`up0', `up')
    70  define(`up1', `%g5')
    71  
        C mul_4 build: AM4 discards its argument, so the rp loads and adds wrapped in
        C AM4 vanish, and ADDX is a plain addcc that starts a new carry chain.
    72  ifdef(`OPERATION_mul_4',`
    73        define(`AM4',      `')
    74        define(`ADDX',	 `addcc`'$1')
    75        define(`func',     `mpn_mul_4')
    76  ')
        C addmul_4 build: AM4 passes its argument through, and ADDX becomes addxccc so
        C that the carry from the preceding AM4 addcc is included.
    77  ifdef(`OPERATION_addmul_4',`
    78        define(`AM4',      `$1')
    79        define(`ADDX',	 `addxccc($1,$2,$3)')
    80        define(`func',     `mpn_addmul_4')
    81  ')
    82  
    83  
    84  MULFUNC_PROLOGUE(mpn_mul_4 mpn_addmul_4)
    85  
    86  ASM_START()
        C v2 and rp1 are kept in %g2 and %g3.
        C NOTE(review): REGISTER is defined outside this file; presumably it declares
        C these globals to the assembler as scratch - confirm.
    87  	REGISTER(%g2,#scratch)
    88  	REGISTER(%g3,#scratch)
        C func is mpn_mul_4 or mpn_addmul_4, depending on which OPERATION_* symbol is
        C defined.
        C
        C Arguments as seen after the save: rp = %i0, up = %i1, n = %i2, vp = %i3.
        C   up  n source limbs, 8 bytes each
        C   vp  4 multiplier limbs, all loaded into registers before the loop
        C   rp  receives limbs 0..n+2 of the product of the up limbs and the vp limbs;
        C       in the addmul_4 build the limbs previously at rp[0..n-1] are added in
        C The most significant limb, index n+3, is not stored; it is left in %i0
        C just before the ret/restore.
        C
        C NOTE(review): there is no small-n path.  Both entry paths read up[1], and
        C the even path falls straight into the loop.  From tracing the index
        C arithmetic this needs n >= 3 for odd n and n >= 4 for even n - confirm
        C against the callers.
    89  PROLOGUE(func)
    90  	save	%sp, -176, %sp		C new register window and 176-byte frame
    91  
        C Setup, interleaved with the vp loads.  n becomes a byte offset, n*8 - 28.
        C rp is backed up by 16, then up0 and rp0 are advanced by the offset, and
        C up1 = up0 + 8, rp1 = rp0 + 8, rp2 = rp0 + 16.  n is negated further down, so
        C [base + n] addresses move upward as n is incremented.  The -28 cancels out
        C of every address; it only decides where n changes sign, which is what the
        C brlz at the bottom of the loop tests.
    92  	ldx	[up + 0], u1		C load up[0] early
    93  	andcc	n, 1, %g0		C is n odd?
    94  	ldx	[vp + 0], v0
    95  	sllx	n, 3, n
    96  	ldx	[vp + 8], v1
    97  	add	n, -28, n
    98  	ldx	[vp + 16], v2
    99  	add	rp, -16, rp
   100  	ldx	[vp + 24], v3		C v3 is %i3, so this overwrites vp
   101  	add	up, n, up0
   102  	add	rp, n, rp0
   103  	add	up0, 8, up1
   104  	add	rp0, 8, rp1
   105  	add	rp0, 16, rp2
        C Low product words of up[0] times v0..v3 go to %l0..%l3; the sum window
        C w0..w3 starts at zero.
   106  	mulx	u1, v0, %l0
   107  	mov	0, w0
   108  	mulx	u1, v1, %l1
   109  	mov	0, w1
   110  	mulx	u1, v2, %l2
   111  	mov	0, w2
   112  	mulx	u1, v3, %l3
   113  	mov	0, w3
   114  
   115  	be	L(evn)			C tests the andcc above; no instruction since then writes the condition codes
   116  	 neg	n, n			C delay slot, executed on both paths
   117  
        C Odd n: enter the loop at L(mid).  up[0] moves to u0 and up[1] is loaded into
        C u1, which are the roles the second half of the loop expects.  %l4..%l7 get
        C the high product words of up[0].
   118  L(odd):	mov	u1, u0
   119  	ldx	[up1 + n], u1
   120  AM4(`	ldx	[rp2 + n], r0')		C addmul_4 only: r0 = rp[0]
   121  	umulxhi(u0, v0, %l4)
   122  	umulxhi(u0, v1, %l5)
   123  	umulxhi(u0, v2, %l6)
   124  	umulxhi(u0, v3, %l7)
   125  	b	L(mid)
   126  	 add	n, 8, n			C delay slot: step one limb
   127  
        C Even n: up[0] stays in u1, up[1] is loaded into u0, and execution falls
        C into L(top).  %l4..%l7 get the high product words of up[0].
   128  L(evn):	ldx	[up1 + n], u0
   129  AM4(`	ldx	[rp2 + n], r0')		C addmul_4 only: r0 = rp[0]
   130  	umulxhi(u1, v0, %l4)
   131  	umulxhi(u1, v1, %l5)
   132  	umulxhi(u1, v2, %l6)
   133  	umulxhi(u1, v3, %l7)
   134  	add	n, 16, n
   135  
        C Main loop, two limbs of up per pass: the L(top) half multiplies by u0, the
        C L(mid) half by u1.  Each half does the following.
        C   1. Add the low product words of the previous limb, %l0..%l3, into the
        C      window w0..w3 and catch the carry out in w4, while issuing the mulx
        C      instructions for the current limb.
        C   2. addmul_4 only: add the rp limb held in r0 to w0.
        C   3. Store w0 as a finished result limb and load the next limb of up.
        C   4. Add the high product words of the previous limb, %l4..%l7, while
        C      shifting the window down one limb (w1 to w0, ..., w4 to w3), and
        C      issue the umulxhi instructions for the current limb.
        C ADDX starts a new carry chain in the mul_4 build and continues the chain
        C from step 2 in the addmul_4 build.
   136  	ALIGN(16)
   137  L(top):	addcc	%l0, w0, w0
   138  	mulx	u0, v0, %l0	C w 0
   139  	addxccc(%l1, w1, w1)
   140  	mulx	u0, v1, %l1	C w 1
   141  	addxccc(%l2, w2, w2)
   142  	mulx	u0, v2, %l2	C w 2
   143  	addxccc(%l3, w3, w3)
   144  	mulx	u0, v3, %l3	C w 3
   145  	ldx	[up0 + n], u1
   146  	addxc(	%g0, %g0, w4)
   147  AM4(`	addcc	r0, w0, w0')
   148  	stx	w0, [rp0 + n]
   149  	ADDX(`	%l4, w1, w0')
   150  	umulxhi(u0, v0, %l4)	C w 1
   151  AM4(`	ldx	[rp1 + n], r0')
   152  	addxccc(%l5, w2, w1)
   153  	umulxhi(u0, v1, %l5)	C w 2
   154  	addxccc(%l6, w3, w2)
   155  	umulxhi(u0, v2, %l6)	C w 3
   156  	addxc(	%l7, w4, w3)
   157  	umulxhi(u0, v3, %l7)	C w 4
   158  L(mid):	addcc	%l0, w0, w0
   159  	mulx	u1, v0, %l0	C w 1
   160  	addxccc(%l1, w1, w1)
   161  	mulx	u1, v1, %l1	C w 2
   162  	addxccc(%l2, w2, w2)
   163  	mulx	u1, v2, %l2	C w 3
   164  	addxccc(%l3, w3, w3)
   165  	mulx	u1, v3, %l3	C w 4
   166  	ldx	[up1 + n], u0
   167  	addxc(	%g0, %g0, w4)
   168  AM4(`	addcc	r0, w0, w0')
   169  	stx	w0, [rp1 + n]
   170  	ADDX(`	%l4, w1, w0')
   171  	umulxhi(u1, v0, %l4)	C w 2
   172  AM4(`	ldx	[rp2 + n], r0')
   173  	addxccc(%l5, w2, w1)
   174  	umulxhi(u1, v1, %l5)	C w 3
   175  	addxccc(%l6, w3, w2)
   176  	umulxhi(u1, v2, %l6)	C w 4
   177  	addxc(	%l7, w4, w3)
   178  	umulxhi(u1, v3, %l7)	C w 5
   179  	brlz	n, L(top)		C loop while the byte index is still negative
   180  	 add	n, 16, n		C delay slot: step two limbs, also executed on exit
   181  
        C Wind-down.  u0 holds the last limb of up.  One more top-style half runs
        C without a further up load, then the last low and high product words are
        C folded in with no new multiplies and the remaining limbs are stored.
   182  L(end):	addcc	%l0, w0, w0
   183  	mulx	u0, v0, %l0
   184  	addxccc(%l1, w1, w1)
   185  	mulx	u0, v1, %l1
   186  	addxccc(%l2, w2, w2)
   187  	mulx	u0, v2, %l2
   188  	addxccc(%l3, w3, w3)
   189  	mulx	u0, v3, %l3
   190  	addxc(	%g0, %g0, w4)
   191  AM4(`	addcc	r0, w0, w0')
   192  	stx	w0, [rp0 + n]
   193  	ADDX(`	%l4, w1, w0')
   194  	umulxhi(u0, v0, %l4)
   195  AM4(`	ldx	[rp1 + n], r0')		C addmul_4 only: last rp limb that is read
   196  	addxccc(%l5, w2, w1)
   197  	umulxhi(u0, v1, %l5)
   198  	addxccc(%l6, w3, w2)
   199  	umulxhi(u0, v2, %l6)
   200  	addxc(	%l7, w4, w3)
   201  	umulxhi(u0, v3, %l7)
   202  	addcc	%l0, w0, w0
   203  	addxccc(%l1, w1, w1)
   204  	addxccc(%l2, w2, w2)
   205  	addxccc(%l3, w3, w3)
   206  	addxc(	%g0, %g0, w4)
   207  AM4(`	addcc	r0, w0, w0')
   208  	stx	w0, [rp1 + n]
   209  	ADDX(`	%l4, w1, w0')
   210  	addxccc(%l5, w2, w1)
   211  	addxccc(%l6, w3, w2)
   212  	stx	w0, [rp2 + n]
   213  	add	n, 16, n		C step two limbs so rp1 and rp2 address the last two stored limbs
   214  	stx	w1, [rp1 + n]
   215  	stx	w2, [rp2 + n]
   216  	addxc(	%l7, w4, %i0)		C most significant limb, left in %i0; rp0 is not used after this
   217  	ret
   218  	 restore
   219  EPILOGUE()