github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/ev6/nails/submul_1.asm (about)

     1  dnl  Alpha ev6 nails mpn_submul_1.
     2  
     3  dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C      cycles/limb
    34  C EV4:    42
    35  C EV5:    18
    36  C EV6:     4
    37  
    38  C TODO
    39  C  * Reroll loop for 3.75 c/l with current 4-way unrolling.
    40  C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
    41  C    umulh.
    42  C  * Use an FP loop count and multiple exit points; that would simplify feed-in lp0
    43  C    and would work since the loop structure is really regular.
    44  
    45  C  INPUT PARAMETERS
        C  Roles as used by the code below: rp is the base of every rl load and
        C  of every result store, up is the base of every ul load, n is the limb
        C  count that drives the dispatch and the loop, and vl0 is one operand of
        C  every mulq/umulh.
    46  define(`rp',`r16')
    47  define(`up',`r17')
    48  define(`n', `r18')
    49  define(`vl0',`r19')
    50  
        C  Mask applied to every result limb before it is stored.  The prologue
        C  sets it to all ones shifted right (logically) by NAIL_BITS.
    51  define(`numb_mask',`r6')
    52  
        C  Product registers: mKa receives the mulq result and mKb the umulh
        C  result for source limb ulK.  m0a shares r0 with the value the function
        C  finally leaves in r0.  m2b (r21) is also used as scratch by the
        C  n mod 4 dispatch before any product is formed.
    53  define(`m0a',`r0')
    54  define(`m0b',`r1')
    55  define(`m1a',`r2')
    56  define(`m1b',`r3')
    57  define(`m2a',`r20')
    58  define(`m2b',`r21')
    59  define(`m3a',`r22')
    60  define(`m3b',`r23')
    61  
        C  Two accumulators, used alternately for successive limbs.  acc0 (r25)
        C  is also used as scratch by the n mod 4 dispatch.
    62  define(`acc0',`r25')
    63  define(`acc1',`r27')
    64  
        C  Source limbs: only two registers are used; ul0/ul2 share r4 and
        C  ul1/ul3 share r5, so a limb must be consumed by its mulq/umulh before
        C  the limb two positions later is loaded.
    65  define(`ul0',`r4')
    66  define(`ul1',`r5')
    67  define(`ul2',`r4')
    68  define(`ul3',`r5')
    69  
        C  Destination limbs: all four names map to the single register r24.
    70  define(`rl0',`r24')
    71  define(`rl1',`r24')
    72  define(`rl2',`r24')
    73  define(`rl3',`r24')
    74  
        C  Temporaries: t0 holds a mulq result shifted right by NAIL_BITS, t1
        C  holds the sra-extracted carry-out that is added into the next limb.
    75  define(`t0',`r7')
    76  define(`t1',`r8')
    77  
        C  Short local names for the GMP_* values, which are defined outside
        C  this file.
    78  define(`NAIL_BITS',`GMP_NAIL_BITS')
    79  define(`NUMB_BITS',`GMP_NUMB_BITS')
    80  
    81  dnl  This declaration is munged by configure
    82  NAILS_SUPPORT(2-63)
    83  
    84  ASM_START()
        C  mpn_submul_1 (Alpha EV6, nails build)
        C
        C  In:   rp  (r16) = limb vector that is read and overwritten in place
        C        up  (r17) = limb vector that is only read
        C        n   (r18) = number of limbs to process
        C        vl0 (r19) = multiplier limb
        C  Out:  r0 = umulh result of the last limb minus the signed carry-out of
        C        the last subtraction
        C
        C  For every limb the code forms
        C      rl - (mulq result >> NAIL_BITS  +  umulh result of previous limb)
        C         + previous carry-out
        C  stores that value masked with numb_mask back to the slot rl was loaded
        C  from, and keeps the bits above NUMB_BITS (arithmetic shift, so the
        C  value is signed) as the carry-out for the next limb.
        C
        C  Structure: n mod 4 selects one of four feed-in sequences (3m4, 0m4,
        C  2m4, 1m4).  Each primes the multiplies and then enters either the
        C  4-way unrolled loop at el0..el3 or, when no further group of four
        C  limbs remains, the wind-down code at ta2..ta6.
        C
        C  NOTE(review): n = 0 takes the 0m4 path, which unconditionally handles
        C  four limbs, so callers presumably must pass n >= 1 -- confirm.
        C  NOTE(review): the split of each product into a low and a high part at
        C  the numb boundary relies on NUMB_BITS = 64 - NAIL_BITS; both values
        C  come from outside this file -- confirm.
    85  PROLOGUE(mpn_submul_1)
    86  	sll	vl0, NAIL_BITS, vl0	C vl0 <<= NAIL_BITS; mulq results are shifted back down below
    87  	lda	numb_mask, -1(r31)	C numb_mask = all ones (r31 reads as zero)
    88  	srl	numb_mask, NAIL_BITS, numb_mask	C clear the top NAIL_BITS bits of the mask
    89  
        C  Dispatch on n mod 4.  r25 and r21 are plain scratch here; they take on
        C  their acc0 and m2b roles only later.
    90  	and	n,	3,	r25
    91  	cmpeq	r25,	1,	r21
    92  	bne	r21,	L(1m4)
    93  	cmpeq	r25,	2,	r21
    94  	bne	r21,	L(2m4)
    95  	beq	r25,	L(0m4)
    96  
        C  n mod 4 = 3: load three source limbs and start their products.
        C  rp is biased by -8 so that the first result slot is 8(rp).
    97  L(3m4):	ldq	ul3,	0(up)
    98  	lda	n,	-4(n)
    99  	ldq	ul0,	8(up)
   100  	mulq	vl0,	ul3,	m3a
   101  	umulh	vl0,	ul3,	m3b
   102  	ldq	ul1,	16(up)
   103  	lda	up,	24(up)
   104  	lda	rp,	-8(rp)
   105  	mulq	vl0,	ul0,	m0a
   106  	umulh	vl0,	ul0,	m0b
   107  	bge	n,	L(ge3)		C n - 4 >= 0: more limbs follow these three
   108  
        C  No further limbs: prime acc1, acc0 and t1, then finish in the tail.
        C  The first limb has no previous high part, hence the add of r31 (zero).
   109  	mulq	vl0,	ul1,	m1a
   110  	umulh	vl0,	ul1,	m1b
   111  	ldq	rl3,	8(rp)
   112  	srl	m3a,NAIL_BITS,	t0
   113  	addq	t0,	r31,	acc1
   114  	subq	rl3,	acc1,	acc1
   115  	ldq	rl0,	16(rp)
   116  	srl	m0a,NAIL_BITS,	t0
   117  	addq	t0,	m3b,	acc0
   118  	sra	acc1,NUMB_BITS,	t1
   119  	br	r31,	L(ta3)
   120  
        C  More limbs follow: also load the next source limbs and start their
        C  products, reduce n by another 4, and join the loop at el3.
   121  L(ge3):	ldq	ul2,	0(up)
   122  	mulq	vl0,	ul1,	m1a
   123  	umulh	vl0,	ul1,	m1b
   124  	ldq	rl3,	8(rp)
   125  	srl	m3a,NAIL_BITS,	t0
   126  	ldq	ul3,	8(up)
   127  	lda	n,	-4(n)
   128  	mulq	vl0,	ul2,	m2a
   129  	addq	t0,	r31,	acc1
   130  	umulh	vl0,	ul2,	m2b
   131  	subq	rl3,	acc1,	acc1
   132  	ldq	rl0,	16(rp)
   133  	srl	m0a,NAIL_BITS,	t0
   134  	ldq	ul0,	16(up)
   135  	mulq	vl0,	ul3,	m3a
   136  	addq	t0,	m3b,	acc0
   137  	sra	acc1,NUMB_BITS,	t1
   138  	br	r31,	L(el3)
   139  
        C  n mod 4 = 0: load four source limbs and start their products.  rp is
        C  not biased; the first result slot is 0(rp).  n is reduced by 8 here, so
        C  the bge below tests whether at least one more group of four follows.
   140  L(0m4):	lda	n,	-8(n)
   141  	ldq	ul2,	0(up)
   142  	ldq	ul3,	8(up)
   143  	mulq	vl0,	ul2,	m2a
   144  	umulh	vl0,	ul2,	m2b
   145  	ldq	ul0,	16(up)
   146  	mulq	vl0,	ul3,	m3a
   147  	umulh	vl0,	ul3,	m3b
   148  	ldq	ul1,	24(up)
   149  	lda	up,	32(up)
   150  	mulq	vl0,	ul0,	m0a
   151  	umulh	vl0,	ul0,	m0b
   152  	bge	n,	L(ge4)
   153  
        C  No further limbs: prime acc0, acc1 and t1, then finish in the tail.
   154  	ldq	rl2,	0(rp)
   155  	srl	m2a,NAIL_BITS,	t0
   156  	mulq	vl0,	ul1,	m1a
   157  	addq	t0,	r31,	acc0
   158  	umulh	vl0,	ul1,	m1b
   159  	subq	rl2,	acc0,	acc0
   160  	ldq	rl3,	8(rp)
   161  	srl	m3a,NAIL_BITS,	t0
   162  	addq	t0,	m2b,	acc1
   163  	sra	acc0,NUMB_BITS,	t1
   164  	br	r31,	L(ta4)
   165  
        C  More limbs follow: same priming plus loads of the next source limbs,
        C  then join the loop at el0.
   166  L(ge4):	ldq	rl2,	0(rp)
   167  	srl	m2a,NAIL_BITS,	t0
   168  	ldq	ul2,	0(up)
   169  	mulq	vl0,	ul1,	m1a
   170  	addq	t0,	r31,	acc0
   171  	umulh	vl0,	ul1,	m1b
   172  	subq	rl2,	acc0,	acc0
   173  	ldq	rl3,	8(rp)
   174  	srl	m3a,NAIL_BITS,	t0
   175  	ldq	ul3,	8(up)
   176  	lda	n,	-4(n)
   177  	mulq	vl0,	ul2,	m2a
   178  	addq	t0,	m2b,	acc1
   179  	sra	acc0,NUMB_BITS,	t1
   180  	br	r31,	L(el0)
   181  
        C  n mod 4 = 2: load two source limbs and start the first product.
        C  rp is biased by -16 so that the first result slot is 16(rp).
   182  L(2m4):	lda	n,	-4(n)
   183  	ldq	ul0,	0(up)
   184  	ldq	ul1,	8(up)
   185  	lda	up,	16(up)
   186  	lda	rp,	-16(rp)
   187  	mulq	vl0,	ul0,	m0a
   188  	umulh	vl0,	ul0,	m0b
   189  	bge	n,	L(ge2)
   190  
        C  No further limbs: prime acc0, acc1 and t1, then finish at ta2.
   191  	mulq	vl0,	ul1,	m1a
   192  	umulh	vl0,	ul1,	m1b
   193  	ldq	rl0,	16(rp)
   194  	srl	m0a,NAIL_BITS,	t0
   195  	addq	t0,	r31,	acc0
   196  	subq	rl0,	acc0,	acc0
   197  	ldq	rl1,	24(rp)
   198  	srl	m1a,NAIL_BITS,	t0
   199  	addq	t0,	m0b,	acc1
   200  	sra	acc0,NUMB_BITS,	t1
   201  	br	r31,	L(ta2)
   202  
        C  More limbs follow: load the next four source limbs, advance up and rp
        C  by one group (32 bytes), then enter the loop at el2 if yet another
        C  group follows, otherwise the wind-down code at ta6.
   203  L(ge2):	ldq	ul2,	0(up)
   204  	mulq	vl0,	ul1,	m1a
   205  	umulh	vl0,	ul1,	m1b
   206  	ldq	ul3,	8(up)
   207  	lda	n,	-4(n)
   208  	mulq	vl0,	ul2,	m2a
   209  	umulh	vl0,	ul2,	m2b
   210  	ldq	rl0,	16(rp)
   211  	srl	m0a,NAIL_BITS,	t0
   212  	ldq	ul0,	16(up)
   213  	mulq	vl0,	ul3,	m3a
   214  	addq	t0,	r31,	acc0
   215  	umulh	vl0,	ul3,	m3b
   216  	subq	rl0,	acc0,	acc0
   217  	ldq	rl1,	24(rp)
   218  	srl	m1a,NAIL_BITS,	t0
   219  	ldq	ul1,	24(up)
   220  	lda	up,	32(up)
   221  	lda	rp,	32(rp)
   222  	mulq	vl0,	ul0,	m0a
   223  	addq	t0,	m0b,	acc1
   224  	sra	acc0,NUMB_BITS,	t1
   225  	bge	n,	L(el2)
   226  
   227  	br	r31,	L(ta6)
   228  
        C  n mod 4 = 1: load one source limb.
        C  rp is biased by -24 so that the first result slot is 24(rp).
   229  L(1m4):	lda	n,	-4(n)
   230  	ldq	ul1,	0(up)
   231  	lda	up,	8(up)
   232  	lda	rp,	-24(rp)
   233  	bge	n,	L(ge1)
   234  
        C  No further limbs: handle the single limb completely here and return.
   235  	mulq	vl0,	ul1,	m1a
   236  	umulh	vl0,	ul1,	m1b
   237  	ldq	rl1,	24(rp)
   238  	srl	m1a,NAIL_BITS,	t0
   239  	subq	rl1,	t0,	acc1
   240  	and	acc1,numb_mask,	r28
   241  	sra	acc1,NUMB_BITS,	t1
   242  	stq	r28,	24(rp)
   243  	subq	m1b,	t1,	r0	C r0 = umulh result minus signed carry-out
   244  	ret	r31,	(r26),	1
   245  
        C  More limbs follow: load the next four source limbs, advance up and rp
        C  by one group (32 bytes), then go to ta5 if n is now negative, otherwise
        C  fall into ge5.
   246  L(ge1):	ldq	ul2,	0(up)
   247  	mulq	vl0,	ul1,	m1a
   248  	umulh	vl0,	ul1,	m1b
   249  	ldq	ul3,	8(up)
   250  	lda	n,	-4(n)
   251  	mulq	vl0,	ul2,	m2a
   252  	umulh	vl0,	ul2,	m2b
   253  	ldq	ul0,	16(up)
   254  	mulq	vl0,	ul3,	m3a
   255  	umulh	vl0,	ul3,	m3b
   256  	ldq	rl1,	24(rp)
   257  	srl	m1a,NAIL_BITS,	t0
   258  	ldq	ul1,	24(up)
   259  	lda	up,	32(up)
   260  	lda	rp,	32(rp)
   261  	mulq	vl0,	ul0,	m0a
   262  	addq	t0,	r31,	acc1
   263  	umulh	vl0,	ul0,	m0b
   264  	subq	rl1,	acc1,	acc1
   265  	ldq	rl2,	0(rp)
   266  	srl	m2a,NAIL_BITS,	t0
   267  	mulq	vl0,	ul1,	m1a
   268  	addq	t0,	m1b,	acc0
   269  	sra	acc1,NUMB_BITS,	t1
   270  	blt	n,	L(ta5)
   271  
        C  Load the source limb that the skipped part of the loop body would have
        C  loaded, then enter the loop at el1.
   272  L(ge5):	ldq	ul2,	0(up)
   273  	br	r31,	L(el1)
   274  
        C  Main loop, four limbs per iteration; entry points el2, el1, el0, el3.
        C  Steps that recur for each limb:
        C    mulq/umulh  start the two halves of a product
        C    srl         bring a finished mulq result back down by NAIL_BITS
        C    addq t0,mKb add the umulh result of the previous limb
        C    subq rl,acc subtract that sum from the rp limb
        C    addq t1,acc apply the carry-out of the previous limb
        C    sra         extract the new carry-out into t1
        C    and/stq     mask with numb_mask and store the result limb
        C  Each stq writes the limb whose accumulator was finished one step
        C  earlier, hence the negative offsets after rp has been advanced.
        C  n is reduced by 4 once per iteration (between el1 and el0).
        C  The trailing U0/U1/L0/L1 tags are the original annotations
        C  (presumably EV6 issue slots -- not explained in this file).
   275  	ALIGN(16)
   276  L(top):	mulq	vl0,	ul0,	m0a		C U1
   277  	addq	t0,	m0b,	acc1		C L0
   278  	sra	acc0,NUMB_BITS,	t1		C U0
   279  	stq	r28,	-24(rp)			C L1
   280  C
   281  L(el2):	umulh	vl0,	ul0,	m0b		C U1
   282  	and	acc0,numb_mask,	r28		C L0
   283  	subq	rl1,	acc1,	acc1		C U0
   284  	ldq	rl2,	0(rp)			C L1
   285  C
   286  	unop					C U1
   287  	addq	t1,	acc1,	acc1		C L0
   288  	srl	m2a,NAIL_BITS,	t0		C U0
   289  	ldq	ul2,	0(up)			C L1
   290  C
   291  	mulq	vl0,	ul1,	m1a		C U1
   292  	addq	t0,	m1b,	acc0		C L0
   293  	sra	acc1,NUMB_BITS,	t1		C U0
   294  	stq	r28,	-16(rp)			C L1
   295  C
   296  L(el1):	umulh	vl0,	ul1,	m1b		C U1
   297  	and	acc1,numb_mask,	r28		C L0
   298  	subq	rl2,	acc0,	acc0		C U0
   299  	ldq	rl3,	8(rp)			C L1
   300  C
   301  	lda	n,	-4(n)			C L1
   302  	addq	t1,	acc0,	acc0		C L0
   303  	srl	m3a,NAIL_BITS,	t0		C U0
   304  	ldq	ul3,	8(up)			C L1
   305  C
   306  	mulq	vl0,	ul2,	m2a		C U1
   307  	addq	t0,	m2b,	acc1		C L0
   308  	sra	acc0,NUMB_BITS,	t1		C U0
   309  	stq	r28,	-8(rp)			C L1
   310  C
   311  L(el0):	umulh	vl0,	ul2,	m2b		C U1
   312  	and	acc0,numb_mask,	r28		C L0
   313  	subq	rl3,	acc1,	acc1		C U0
   314  	ldq	rl0,	16(rp)			C L1
   315  C
   316  	unop					C U1
   317  	addq	t1,	acc1,	acc1		C L0
   318  	srl	m0a,NAIL_BITS,	t0		C U0
   319  	ldq	ul0,	16(up)			C L1
   320  C
   321  	mulq	vl0,	ul3,	m3a		C U1
   322  	addq	t0,	m3b,	acc0		C L0
   323  	sra	acc1,NUMB_BITS,	t1		C U0
   324  	stq	r28,	0(rp)			C L1
   325  C
   326  L(el3):	umulh	vl0,	ul3,	m3b		C U1
   327  	and	acc1,numb_mask,	r28		C L0
   328  	subq	rl0,	acc0,	acc0		C U0
   329  	ldq	rl1,	24(rp)			C L1
   330  C
   331  	unop					C U1
   332  	addq	t1,	acc0,	acc0		C L0
   333  	srl	m1a,NAIL_BITS,	t0		C U0
   334  	ldq	ul1,	24(up)			C L1
   335  C
   336  	lda	up,	32(up)			C L0
   337  	unop					C U1
   338  	lda	rp,	32(rp)			C L1
   339  	bge	n,	L(top)			C U0
   340  
        C  Wind-down: the same steps as the loop body, but with no further loads
        C  from up and with fewer products started at each stage.  ta6, ta5, ta4,
        C  ta3 and ta2 are the entry points used by the feed-in code; there is no
        C  branch between them, so control runs from each stage into the next.
   341  L(end):	mulq	vl0,	ul0,	m0a
   342  	addq	t0,	m0b,	acc1
   343  	sra	acc0,NUMB_BITS,	t1
   344  	stq	r28,	-24(rp)
   345  L(ta6):	umulh	vl0,	ul0,	m0b
   346  	and	acc0,numb_mask,	r28
   347  	subq	rl1,	acc1,	acc1
   348  	ldq	rl2,	0(rp)
   349  	addq	t1,	acc1,	acc1
   350  	srl	m2a,NAIL_BITS,	t0
   351  	mulq	vl0,	ul1,	m1a
   352  	addq	t0,	m1b,	acc0
   353  	sra	acc1,NUMB_BITS,	t1
   354  	stq	r28,	-16(rp)
   355  L(ta5):	umulh	vl0,	ul1,	m1b
   356  	and	acc1,numb_mask,	r28
   357  	subq	rl2,	acc0,	acc0
   358  	ldq	rl3,	8(rp)
   359  	addq	t1,	acc0,	acc0
   360  	srl	m3a,NAIL_BITS,	t0
   361  	addq	t0,	m2b,	acc1
   362  	sra	acc0,NUMB_BITS,	t1
   363  	stq	r28,	-8(rp)
   364  	unop
   365  	ALIGN(16)
        C  Four limbs left to finish; all products are already complete.
   366  L(ta4):	and	acc0,numb_mask,	r28
   367  	subq	rl3,	acc1,	acc1
   368  	ldq	rl0,	16(rp)
   369  	addq	t1,	acc1,	acc1
   370  	srl	m0a,NAIL_BITS,	t0
   371  	addq	t0,	m3b,	acc0
   372  	sra	acc1,NUMB_BITS,	t1
   373  	stq	r28,	0(rp)
   374  	unop
   375  	ALIGN(16)
        C  Three limbs left to finish.
   376  L(ta3):	and	acc1,numb_mask,	r28
   377  	subq	rl0,	acc0,	acc0
   378  	ldq	rl1,	24(rp)
   379  	addq	t1,	acc0,	acc0
   380  	srl	m1a,NAIL_BITS,	t0
   381  	addq	t0,	m0b,	acc1
   382  	sra	acc0,NUMB_BITS,	t1
   383  	stq	r28,	8(rp)
   384  	unop
   385  	ALIGN(16)
        C  Last two limbs: store both, then form the value left in r0 from the
        C  umulh result of the last limb (m1b) and the final carry-out (t1).
   386  L(ta2):	and	acc0,numb_mask,	r28
   387  	subq	rl1,	acc1,	acc1
   388  	addq	t1,	acc1,	acc1
   389  	sra	acc1,NUMB_BITS,	t1
   390  	stq	r28,	16(rp)
   391  	and	acc1,numb_mask,	r28
   392  	subq	m1b,	t1,	r0	C r0 = last umulh result minus signed carry-out
   393  	stq	r28,	24(rp)
   394  	ret	r31,	(r26),	1
   395  EPILOGUE()
   396  ASM_END()