github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/ev6/nails/mul_1.asm (about)

     1  dnl  Alpha ev6 nails mpn_mul_1.
     2  
     3  dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C      cycles/limb
    34  C EV4:    42
    35  C EV5:    18
    36  C EV6:     3.25
    37  
    38  C TODO
    39  C  * Reroll loop for 3.0 c/l with current 4-way unrolling.
    40  C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
    41  C    umulh.
    42  C  * Use FP loop count and multiple exit points; that would simplify feed-in lp0
    43  C    and would work since the loop structure is really regular.
    44  
    45  C  INPUT PARAMETERS
    46  define(`rp',`r16')	C destination pointer: result limbs are stored through it
    47  define(`up',`r17')	C source pointer: operand limbs are loaded through it
    48  define(`n', `r18')	C limb count: decremented in steps of 4 and tested by sign
    49  define(`vl0',`r19')	C multiplier limb: shifted left by NAIL_BITS at function entry
    50  
    51  define(`numb_mask',`r6')	C all-ones shifted right by NAIL_BITS: set up at function entry
    52  
    53  define(`m0a',`r0')	C mNa receives the mulq result (low 64 bits) of product N
    54  define(`m0b',`r1')	C mNb receives the umulh result (high 64 bits) of product N
    55  define(`m1a',`r2')
    56  define(`m1b',`r3')
    57  define(`m2a',`r20')
    58  define(`m2b',`r21')	C r21 is also used as compare scratch during the n mod 4 dispatch
    59  define(`m3a',`r22')
    60  define(`m3b',`r23')
    61  
    62  define(`acc0',`r25')	C accumulators for alternate result limbs
    63  define(`acc1',`r27')	C (r25 also holds n mod 4 during the dispatch)
    64  
    65  define(`ul0',`r4')	C source limbs: ul0 and ul2 share r4
    66  define(`ul1',`r5')	C               ul1 and ul3 share r5
    67  define(`ul2',`r4')
    68  define(`ul3',`r5')
    69  
    70  define(`rl0',`r24')	C rl0..rl3 all name r24 and are not referenced by the code in this file
    71  define(`rl1',`r24')
    72  define(`rl2',`r24')
    73  define(`rl3',`r24')
    74  
    75  define(`t0',`r7')	C scratch: a low product half shifted right by NAIL_BITS
    76  define(`t1',`r8')	C scratch: an accumulator shifted right by NUMB_BITS (the carry)
    77  
    78  define(`NAIL_BITS',`GMP_NAIL_BITS')
    79  define(`NUMB_BITS',`GMP_NUMB_BITS')
    80  
    81  dnl  This declaration is munged by configure
    82  NAILS_SUPPORT(1-63)
    83  
    84  ASM_START()
        C  mpn_mul_1 for builds with nails: multiplies the n limbs read through up
        C  by vl0, stores the resulting limbs through rp, and returns the final
        C  carry-out limb in r0.
        C
        C  Method: vl0 is first shifted left by NAIL_BITS.  For each source limb,
        C  mulq gives the low 64 bits of the product and umulh the high 64 bits.
        C  Each result limb is formed as
        C      acc = (low half of this product >> NAIL_BITS)
        C          + (high half of the previous product)
        C          + (previous acc >> NUMB_BITS)
        C  and acc AND numb_mask is what gets stored.
        C  NOTE(review): this presumably relies on NAIL_BITS + NUMB_BITS = 64 and
        C  on the source limbs having zero nail bits - confirm against gmp.h.
        C
        C  Layout: the code dispatches on n mod 4 to one of four feed-in
        C  sequences.  Each feed-in either jumps into the four-limb loop at one of
        C  L(el0)..L(el3), or, when too few limbs remain, into the wind-down code
        C  at one of L(ta2)..L(ta6).  Where needed the feed-ins bias rp so that
        C  the fixed store offsets in the loop and wind-down code address the
        C  right limbs.  r28 holds the masked limb that is waiting to be stored.
    85  PROLOGUE(mpn_mul_1)
    86  	sll	vl0, NAIL_BITS, vl0	C pre-shift the multiplier by NAIL_BITS
    87  	lda	numb_mask, -1(r31)	C numb_mask = -1 (r31 is the Alpha zero register)
    88  	srl	numb_mask, NAIL_BITS, numb_mask	C clear the top NAIL_BITS bits of the mask
    89  C Dispatch on n mod 4.  r25 and r21 are only scratch at this point.
    90  	and	n,	3,	r25
    91  	cmpeq	r25,	1,	r21
    92  	bne	r21,	L(1m4)	C n mod 4 = 1
    93  	cmpeq	r25,	2,	r21
    94  	bne	r21,	L(2m4)	C n mod 4 = 2
    95  	beq	r25,	L(0m4)	C n mod 4 = 0, otherwise fall through for n mod 4 = 3
    96  C n mod 4 = 3: start the first products.  rp is biased by -8, up advances by 24.
    97  L(3m4):	ldq	ul3,	0(up)
    98  	lda	n,	-4(n)
    99  	ldq	ul0,	8(up)
   100  	mulq	vl0,	ul3,	m3a
   101  	umulh	vl0,	ul3,	m3b
   102  	ldq	ul1,	16(up)
   103  	lda	up,	24(up)
   104  	lda	rp,	-8(rp)
   105  	mulq	vl0,	ul0,	m0a
   106  	umulh	vl0,	ul0,	m0b
   107  	bge	n,	L(ge3)	C n - 4 >= 0: more limbs follow, so feed the loop
   108  C Fewer than 4 limbs in total: finish the feed-in and go to the wind-down code.
   109  	mulq	vl0,	ul1,	m1a
   110  	umulh	vl0,	ul1,	m1b
   111  	srl	m3a,NAIL_BITS,	t0
   112  	addq	t0,	r31,	acc1	C acc1 = t0 (add of zero): first limb has no previous high half
   113  	srl	m0a,NAIL_BITS,	t0
   114  	addq	t0,	m3b,	acc0
   115  	srl	acc1,NUMB_BITS,	t1
   116  	br	r31,	L(ta3)
   117  C Feed-in for the loop: load the next limbs and join at L(el3).
   118  L(ge3):	ldq	ul2,	0(up)
   119  	mulq	vl0,	ul1,	m1a
   120  	umulh	vl0,	ul1,	m1b
   121  	srl	m3a,NAIL_BITS,	t0
   122  	ldq	ul3,	8(up)
   123  	lda	n,	-4(n)
   124  	mulq	vl0,	ul2,	m2a
   125  	addq	t0,	r31,	acc1
   126  	umulh	vl0,	ul2,	m2b
   127  	srl	m0a,NAIL_BITS,	t0
   128  	ldq	ul0,	16(up)
   129  	mulq	vl0,	ul3,	m3a
   130  	addq	t0,	m3b,	acc0
   131  	srl	acc1,NUMB_BITS,	t1
   132  	br	r31,	L(el3)
   133  C n mod 4 = 0: n is reduced by 8 here, rp is left unbiased, up advances by 32.
   134  L(0m4):	lda	n,	-8(n)
   135  	ldq	ul2,	0(up)
   136  	ldq	ul3,	8(up)
   137  	mulq	vl0,	ul2,	m2a
   138  	umulh	vl0,	ul2,	m2b
   139  	ldq	ul0,	16(up)
   140  	mulq	vl0,	ul3,	m3a
   141  	umulh	vl0,	ul3,	m3b
   142  	ldq	ul1,	24(up)
   143  	lda	up,	32(up)
   144  	mulq	vl0,	ul0,	m0a
   145  	umulh	vl0,	ul0,	m0b
   146  	bge	n,	L(ge4)	C n - 8 >= 0: more limbs follow, so feed the loop
   147  C Fewer than 8 limbs in total: finish the feed-in and go to the wind-down code.
   148  	srl	m2a,NAIL_BITS,	t0
   149  	mulq	vl0,	ul1,	m1a
   150  	addq	t0,	r31,	acc0
   151  	umulh	vl0,	ul1,	m1b
   152  	srl	m3a,NAIL_BITS,	t0
   153  	addq	t0,	m2b,	acc1
   154  	srl	acc0,NUMB_BITS,	t1
   155  	br	r31,	L(ta4)
   156  C Feed-in for the loop: load the next limbs and join at L(el0).
   157  L(ge4):	srl	m2a,NAIL_BITS,	t0
   158  	ldq	ul2,	0(up)
   159  	mulq	vl0,	ul1,	m1a
   160  	addq	t0,	r31,	acc0
   161  	umulh	vl0,	ul1,	m1b
   162  	srl	m3a,NAIL_BITS,	t0
   163  	ldq	ul3,	8(up)
   164  	lda	n,	-4(n)
   165  	mulq	vl0,	ul2,	m2a
   166  	addq	t0,	m2b,	acc1
   167  	srl	acc0,NUMB_BITS,	t1
   168  	br	r31,	L(el0)
   169  C n mod 4 = 2: rp is biased by -16, up advances by 16.
   170  L(2m4):	lda	n,	-4(n)
   171  	ldq	ul0,	0(up)
   172  	ldq	ul1,	8(up)
   173  	lda	up,	16(up)
   174  	lda	rp,	-16(rp)
   175  	mulq	vl0,	ul0,	m0a
   176  	umulh	vl0,	ul0,	m0b
   177  	bge	n,	L(ge2)	C n - 4 >= 0: more limbs follow
   178  C Fewer than 4 limbs in total: finish both products and go to the last tail stage.
   179  	mulq	vl0,	ul1,	m1a
   180  	umulh	vl0,	ul1,	m1b
   181  	srl	m0a,NAIL_BITS,	t0
   182  	addq	t0,	r31,	acc0
   183  	srl	m1a,NAIL_BITS,	t0
   184  	addq	t0,	m0b,	acc1
   185  	srl	acc0,NUMB_BITS,	t1
   186  	br	r31,	L(ta2)
   187  C Four more limbs are loaded here.  rp and up both advance by 32.
   188  L(ge2):	ldq	ul2,	0(up)
   189  	mulq	vl0,	ul1,	m1a
   190  	umulh	vl0,	ul1,	m1b
   191  	ldq	ul3,	8(up)
   192  	lda	n,	-4(n)
   193  	mulq	vl0,	ul2,	m2a
   194  	umulh	vl0,	ul2,	m2b
   195  	srl	m0a,NAIL_BITS,	t0
   196  	ldq	ul0,	16(up)
   197  	mulq	vl0,	ul3,	m3a
   198  	addq	t0,	r31,	acc0
   199  	umulh	vl0,	ul3,	m3b
   200  	srl	m1a,NAIL_BITS,	t0
   201  	ldq	ul1,	24(up)
   202  	lda	up,	32(up)
   203  	lda	rp,	32(rp)
   204  	mulq	vl0,	ul0,	m0a
   205  	addq	t0,	m0b,	acc1
   206  	srl	acc0,NUMB_BITS,	t1
   207  	bge	n,	L(el2)	C still more limbs: enter the loop
   208  
   209  	br	r31,	L(ta6)	C otherwise drain through the wind-down code
   210  C n mod 4 = 1: rp is biased by -24, up advances by 8.
   211  L(1m4):	lda	n,	-4(n)
   212  	ldq	ul1,	0(up)
   213  	lda	up,	8(up)
   214  	lda	rp,	-24(rp)
   215  	bge	n,	L(ge1)	C n - 4 >= 0: more limbs follow
   216  C Fewer than 4 limbs in total, so a single limb: compute, store and return here.
   217  	mulq	vl0,	ul1,	m1a
   218  	umulh	vl0,	ul1,	m1b
   219  	srl	m1a,NAIL_BITS,	t0
   220  	addq	t0,	r31,	acc1
   221  	and	acc1,numb_mask,	r28
   222  	srl	acc1,NUMB_BITS,	t1
   223  	stq	r28,	24(rp)	C rp was biased by -24, so this is the first result limb
   224  	addq	t1,	m1b,	r0	C return value: carry plus high half of the product
   225  	ret	r31,	(r26),	1
   226  C Four more limbs are loaded here.  rp and up both advance by 32.
   227  L(ge1):	ldq	ul2,	0(up)
   228  	mulq	vl0,	ul1,	m1a
   229  	umulh	vl0,	ul1,	m1b
   230  	ldq	ul3,	8(up)
   231  	lda	n,	-4(n)
   232  	mulq	vl0,	ul2,	m2a
   233  	umulh	vl0,	ul2,	m2b
   234  	ldq	ul0,	16(up)
   235  	mulq	vl0,	ul3,	m3a
   236  	umulh	vl0,	ul3,	m3b
   237  	srl	m1a,NAIL_BITS,	t0
   238  	ldq	ul1,	24(up)
   239  	lda	up,	32(up)
   240  	lda	rp,	32(rp)
   241  	mulq	vl0,	ul0,	m0a
   242  	addq	t0,	r31,	acc1
   243  	umulh	vl0,	ul0,	m0b
   244  	srl	m2a,NAIL_BITS,	t0
   245  	mulq	vl0,	ul1,	m1a
   246  	addq	t0,	m1b,	acc0
   247  	srl	acc1,NUMB_BITS,	t1
   248  	blt	n,	L(ta5)	C no further limbs: drain through the wind-down code
   249  
   250  L(ge5):	ldq	ul2,	0(up)	C otherwise load the next limb and enter the loop
   251  	br	r31,	L(el1)
   252  C Main loop: four limbs per pass, entered at L(el0)..L(el3) by the feed-in code.
        C The U0/U1/L0/L1 tags are presumably the intended EV6 issue pipes - confirm.
        C The unop instructions are no-op padding.
   253  	ALIGN(16)
   254  L(top):	mulq	vl0,	ul0,	m0a		C U1
   255  	addq	t0,	m0b,	acc1		C L0
   256  	srl	acc0,NUMB_BITS,	t1		C U0
   257  	stq	r28,	-24(rp)			C L1
   258  C
   259  L(el2):	umulh	vl0,	ul0,	m0b		C U1
   260  	and	acc0,numb_mask,	r28		C L0
   261  	unop					C U0
   262  	unop					C L1
   263  C
   264  	unop					C U1
   265  	addq	t1,	acc1,	acc1		C L0
   266  	srl	m2a,NAIL_BITS,	t0		C U0
   267  	ldq	ul2,	0(up)			C L1
   268  C
   269  	mulq	vl0,	ul1,	m1a		C U1
   270  	addq	t0,	m1b,	acc0		C L0
   271  	srl	acc1,NUMB_BITS,	t1		C U0
   272  	stq	r28,	-16(rp)			C L1
   273  C
   274  L(el1):	umulh	vl0,	ul1,	m1b		C U1
   275  	and	acc1,numb_mask,	r28		C L0
   276  	unop					C U0
   277  	lda	n,	-4(n)			C L1
   278  C
   279  	unop					C U1
   280  	addq	t1,	acc0,	acc0		C L0
   281  	srl	m3a,NAIL_BITS,	t0		C U0
   282  	ldq	ul3,	8(up)			C L1
   283  C
   284  	mulq	vl0,	ul2,	m2a		C U1
   285  	addq	t0,	m2b,	acc1		C L0
   286  	srl	acc0,NUMB_BITS,	t1		C U0
   287  	stq	r28,	-8(rp)			C L1
   288  C
   289  L(el0):	umulh	vl0,	ul2,	m2b		C U1
   290  	and	acc0,numb_mask,	r28		C L0
   291  	unop					C U0
   292  	unop					C L1
   293  C
   294  	unop					C U1
   295  	addq	t1,	acc1,	acc1		C L0
   296  	srl	m0a,NAIL_BITS,	t0		C U0
   297  	ldq	ul0,	16(up)			C L1
   298  C
   299  	mulq	vl0,	ul3,	m3a		C U1
   300  	addq	t0,	m3b,	acc0		C L0
   301  	srl	acc1,NUMB_BITS,	t1		C U0
   302  	stq	r28,	0(rp)			C L1
   303  C
   304  L(el3):	umulh	vl0,	ul3,	m3b		C U1
   305  	and	acc1,numb_mask,	r28		C L0
   306  	unop					C U0
   307  	unop					C L1
   308  C
   309  	unop					C U1
   310  	addq	t1,	acc0,	acc0		C L0
   311  	srl	m1a,NAIL_BITS,	t0		C U0
   312  	ldq	ul1,	24(up)			C L1
   313  C
   314  	lda	up,	32(up)			C L0
   315  	unop					C U1
   316  	lda	rp,	32(rp)			C L1
   317  	bge	n,	L(top)			C U0
   318  C Wind-down: same steps as the loop but with no further loads.  L(ta6)..L(ta2) are entry points.
   319  L(end):	mulq	vl0,	ul0,	m0a
   320  	addq	t0,	m0b,	acc1
   321  	srl	acc0,NUMB_BITS,	t1
   322  	stq	r28,	-24(rp)
   323  L(ta6):	umulh	vl0,	ul0,	m0b
   324  	and	acc0,numb_mask,	r28
   325  	addq	t1,	acc1,	acc1
   326  	srl	m2a,NAIL_BITS,	t0
   327  	mulq	vl0,	ul1,	m1a
   328  	addq	t0,	m1b,	acc0
   329  	srl	acc1,NUMB_BITS,	t1
   330  	stq	r28,	-16(rp)
   331  L(ta5):	umulh	vl0,	ul1,	m1b
   332  	and	acc1,numb_mask,	r28
   333  	addq	t1,	acc0,	acc0
   334  	srl	m3a,NAIL_BITS,	t0
   335  	addq	t0,	m2b,	acc1
   336  	srl	acc0,NUMB_BITS,	t1
   337  	stq	r28,	-8(rp)
   338  	ALIGN(16)
   339  L(ta4):	and	acc0,numb_mask,	r28
   340  	addq	t1,	acc1,	acc1
   341  	srl	m0a,NAIL_BITS,	t0
   342  	addq	t0,	m3b,	acc0
   343  	srl	acc1,NUMB_BITS,	t1
   344  	stq	r28,	0(rp)
   345  	unop
   346  	ALIGN(16)
   347  L(ta3):	and	acc1,numb_mask,	r28
   348  	addq	t1,	acc0,	acc0
   349  	srl	m1a,NAIL_BITS,	t0
   350  	addq	t0,	m0b,	acc1
   351  	srl	acc0,NUMB_BITS,	t1
   352  	stq	r28,	8(rp)
   353  	unop
   354  	ALIGN(16)
   355  L(ta2):	and	acc0,numb_mask,	r28
   356  	addq	t1,	acc1,	acc1
   357  	srl	acc1,NUMB_BITS,	t1
   358  	stq	r28,	16(rp)
   359  	and	acc1,numb_mask,	r28
   360  	addq	t1,	m1b,	r0	C return value: last carry plus high half of the last product
   361  	stq	r28,	24(rp)	C last result limb
   362  	ret	r31,	(r26),	1
   363  EPILOGUE()
   364  ASM_END()