github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/ev5/diveby3.asm (about)

     1  dnl  Alpha mpn_divexact_by3c -- mpn division by 3, expecting no remainder.
     2  
     3  dnl  Copyright 2004, 2005, 2009 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C      cycles/limb
    34  C EV4:    22
    35  C EV5:    11.5
    36  C EV6:     6.3		Note that mpn_bdiv_dbm1c is faster
    37  
    38  C TODO
    39  C  * Remove the unops, they benefit just ev6, which no longer uses this file.
    40  C  * Try prefetch for destination, using lds.
    41  C  * Improve feed-in code, by moving initial mulq earlier; make initial load
    42  C    to u0/u0 to save some copying.
    43  C  * Combine u0 and u2, u1 and u3.
    44  
    45  C INPUT PARAMETERS
    46  define(`rp',	`r16')		C result pointer: base register of every stq
    47  define(`up',	`r17')		C source pointer: base register of every limb ldq
    48  define(`n',	`r18')		C limb count; biased and reused as the loop counter
    49  define(`cy',	`r19')		C carry-in on entry, running carry afterwards
    50  
    51  ASM_START()
    52  
        C Constant table read at function entry (offsets 0, 8 and 16):
        C   0xAAAAAAAAAAAAAAAB  multiplier applied to every limb with mulq;
        C                       3 * 0xAAAAAAAAAAAAAAAB = 1 modulo 2^64
        C   0x5555555555555555  (2^64 - 1) / 3, first carry threshold
        C   0xAAAAAAAAAAAAAAAA  2 * (2^64 - 1) / 3, second carry threshold
    53  DATASTART(L(LC),8)
    54  	.quad	0xAAAAAAAAAAAAAAAB
    55  	.quad	0x5555555555555555
    56  	.quad	0xAAAAAAAAAAAAAAAA
    57  DATAEND()
    58  
        C Working registers.
        C   xAAAA..AB, x5555..55, xAAAA..AA  hold the three table constants
        C   u0-u3   source limbs in flight (r0 is also used as the table
        C           address at entry and receives the final carry)
        C   q0, q1  low 64 bits of limb * multiplier (r4/r5 also hold the
        C           cmpeq results during feed-in)
        C   l0      correction added to the product; a multiple of
        C           0x5555555555555555 selected by the carry
        C   x       result limb about to be stored (r8 also holds n mod 4
        C           during feed-in)
        C   p6, p7  0/1 results of comparing x with the two thresholds
        C   t0, t1  masked copies of 0x5555555555555555 used to rebuild l0
        C   cymask  negated borrow; r28 also holds the first limb at entry
    59  define(`xAAAAAAAAAAAAAAAB',	`r20')
    60  define(`x5555555555555555',	`r21')
    61  define(`xAAAAAAAAAAAAAAAA',	`r22')
    62  define(`u0',	`r0')	define(`u1',	`r1')
    63  define(`u2',	`r2')	define(`u3',	`r3')
    64  define(`l0',	`r25')	define(`x',	`r8')
    65  define(`q0',	`r4')	define(`q1',	`r5')
    66  define(`p6',	`r6')	define(`p7',	`r7')
    67  define(`t0',	`r23')	define(`t1',	`r24')
    68  define(`cymask',`r28')
    69  
    70  
    71  PROLOGUE(mpn_divexact_by3c,gp)
        C Outline, as read from the code below.
        C   * Each source limb u is multiplied by 0xAAAAAAAAAAAAAAAB and the
        C     correction l0 is added; the sum x is the result limb stored
        C     through rp.
        C   * Carry for the next limb:
        C       cy = (u < cy) + (x > 0x5555555555555555)
        C                     + (x > 0xAAAAAAAAAAAAAAAA)
        C   * Correction for the next limb: l0 = cy * 0x5555555555555555,
        C     assembled without branches from three masked copies of the
        C     constant.
        C   * The loop is unrolled four times and software pipelined: each
        C     stage finishes one limb while issuing the mulq and the ldq
        C     needed by later stages.  Entry is at the stage selected by
        C     n mod 4.
        C   * The final carry is left in r0 before the ret.
    72  
    73  	ldq	r28, 0(up)			C load first limb early
    74  
    75  C Put magic constants in registers
    76  	lda	r0, L(LC)			C r0 = address of the constant table
    77  	ldq	xAAAAAAAAAAAAAAAB, 0(r0)
    78  	ldq	x5555555555555555, 8(r0)
    79  	ldq	xAAAAAAAAAAAAAAAA, 16(r0)
    80  
    81  C Compute initial l0 value
        C l0 = 0x5555555555555555 when cy = 1, 0xAAAAAAAAAAAAAAAA when
        C cy = 2, and 0 for every other incoming cy.
    82  	cmpeq	cy, 1, p6
    83  	cmpeq	cy, 2, p7
    84  	negq	p6, p6				C all-ones mask when cy = 1
    85  	and	p6, x5555555555555555, l0
    86  	cmovne	p7, xAAAAAAAAAAAAAAAA, l0	C override when cy = 2
    87  
    88  C Feed-in depending on (n mod 4)
    89  	and	n, 3, r8			C r8 = n mod 4
    90  	lda	n, -3(n)			C bias the count; loop repeats while n >= 0
    91  	cmpeq	r8, 1, r4
    92  	cmpeq	r8, 2, r5
    93  	bne	r4, $Lb01			C n mod 4 = 1
    94  	bne	r5, $Lb10			C n mod 4 = 2
    95  	beq	r8, $Lb00			C n mod 4 = 0; 3 falls through
    96  
        C n mod 4 = 3: enter at the last stage.  rp and up are moved back by 24
        C so that the 24(rp) store and the 40(up) load of that stage address
        C limb 0 of the result and limb 2 of the source.
    97  $Lb11:	ldq	u3, 8(up)
    98  	lda	up, -24(up)
    99  	lda	rp, -24(rp)
   100  	mulq	r28, xAAAAAAAAAAAAAAAB, q0
   101  	mov	r28, u2
   102  	br	r31, $L11
   103  
        C n mod 4 = 0: enter at the third stage, pointers moved back by 16 so
        C that 16(rp) and 32(up) address result limb 0 and source limb 2.
   104  $Lb00:	ldq	u2, 8(up)
   105  	lda	up, -16(up)
   106  	lda	rp, -16(rp)
   107  	mulq	r28, xAAAAAAAAAAAAAAAB, q1
   108  	mov	r28, u1
   109  	br	r31, $L00
   110  
        C n mod 4 = 1: enter at the second stage, or go straight to the final
        C limb code when the biased count is negative (n was 1).  rp is moved
        C back by 8 first because both targets store to 8(rp).
   111  $Lb01:	lda	rp, -8(rp)
   112  	mulq	r28, xAAAAAAAAAAAAAAAB, q0
   113  	mov	r28, u0
   114  	blt	n, $Lcj1
   115  	ldq	u1, 8(up)
   116  	lda	up, -8(up)
   117  	br	r31, $L01
   118  
        C n mod 4 = 2: no pointer adjustment; fall into the first stage, or
        C skip the loop when the biased count is negative (n was 2).
   119  $Lb10:	ldq	u0, 8(up)
   120  	mulq	r28, xAAAAAAAAAAAAAAAB, q1
   121  	mov	r28, u3
   122  	blt	n, $Lend
   123  
   124  	ALIGN(16)
   125  $Ltop:
        C Stage 1 of 4.  Finishes the limb in u3 (product already in q1),
        C issues the mulq for u0 and loads u1.  Step by step:
        C   cy     = (u3 < cy)               borrow against the incoming carry
        C   x      = q1 + l0                 result limb, uses the old l0
        C   cymask = -cy
        C   p6     = (x > 0x5555555555555555)
        C   p7     = (x > 0xAAAAAAAAAAAAAAAA)
        C   cy     = cy + p6 + p7            carry for the next limb
        C   l0     = (cymask AND 0x5555555555555555)
        C            + (-p6 AND 0x5555555555555555)
        C            + (-p7 AND 0x5555555555555555)
        C The following stages repeat this with the u and q registers rotated.
        C NOTE(review): the numbered "C 0" .. "C 5" lines and the L0/U0/L1/U1
        C tags presumably record the intended issue cycle and pipe of each
        C instruction; this file does not define them.
   126  C 0
   127  	cmpult	u3, cy, cy			C L0
   128  	mulq	u0, xAAAAAAAAAAAAAAAB, q0	C U1
   129  	ldq	u1, 16(up)			C L1
   130  	addq	q1, l0, x			C U0
   131  C 1
   132  	negq	cy, cymask			C L0
   133  	unop					C U1
   134  	unop					C L1
   135  	cmpult	x5555555555555555, x, p6	C U0
   136  C 2
   137  	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
   138  	unop
   139  	unop
   140  	negq	p6, t0				C L0
   141  C 3
   142  	negq	p7, t1				C L0
   143  	and	cymask, x5555555555555555, l0	C U1
   144  	addq	p6, cy, cy
   145  	and	t0, x5555555555555555, t0
   146  C 4
   147  	and	t1, x5555555555555555, t1
   148  	addq	p7, cy, cy
   149  	unop
   150  	addq	t0, l0, l0
   151  C 5
   152  	addq	t1, l0, l0
   153  	unop
   154  	stq	x, 0(rp)			C L1
   155  	unop
   156  $L01:
        C Stage 2 of 4.  Finishes u0 using q0, issues the mulq for u1,
        C loads u2, stores to 8(rp).
   157  C 0
   158  	cmpult	u0, cy, cy			C L0
   159  	mulq	u1, xAAAAAAAAAAAAAAAB, q1	C U1
   160  	ldq	u2, 24(up)			C L1
   161  	addq	q0, l0, x			C U0
   162  C 1
   163  	negq	cy, cymask			C L0
   164  	unop					C U1
   165  	unop					C L1
   166  	cmpult	x5555555555555555, x, p6	C U0
   167  C 2
   168  	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
   169  	unop
   170  	unop
   171  	negq	p6, t0				C L0
   172  C 3
   173  	negq	p7, t1				C L0
   174  	and	cymask, x5555555555555555, l0	C U1
   175  	addq	p6, cy, cy
   176  	and	t0, x5555555555555555, t0
   177  C 4
   178  	and	t1, x5555555555555555, t1
   179  	addq	p7, cy, cy
   180  	unop
   181  	addq	t0, l0, l0
   182  C 5
   183  	addq	t1, l0, l0
   184  	unop
   185  	stq	x, 8(rp)			C L1
   186  	unop
   187  $L00:
        C Stage 3 of 4.  Finishes u1 using q1, issues the mulq for u2,
        C loads u3, stores to 16(rp).
   188  C 0
   189  	cmpult	u1, cy, cy			C L0
   190  	mulq	u2, xAAAAAAAAAAAAAAAB, q0	C U1
   191  	ldq	u3, 32(up)			C L1
   192  	addq	q1, l0, x			C U0
   193  C 1
   194  	negq	cy, cymask			C L0
   195  	unop					C U1
   196  	unop					C L1
   197  	cmpult	x5555555555555555, x, p6	C U0
   198  C 2
   199  	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
   200  	unop
   201  	unop
   202  	negq	p6, t0				C L0
   203  C 3
   204  	negq	p7, t1				C L0
   205  	and	cymask, x5555555555555555, l0	C U1
   206  	addq	p6, cy, cy
   207  	and	t0, x5555555555555555, t0
   208  C 4
   209  	and	t1, x5555555555555555, t1
   210  	addq	p7, cy, cy
   211  	unop
   212  	addq	t0, l0, l0
   213  C 5
   214  	addq	t1, l0, l0
   215  	unop
   216  	stq	x, 16(rp)			C L1
   217  	unop
   218  $L11:
        C Stage 4 of 4.  Finishes u2 using q0, issues the mulq for u3,
        C loads u0, stores to 24(rp).  This stage also carries the loop
        C bookkeeping: n drops by 4 and both pointers advance by 32 bytes.
   219  C 0
   220  	cmpult	u2, cy, cy			C L0
   221  	mulq	u3, xAAAAAAAAAAAAAAAB, q1	C U1
   222  	ldq	u0, 40(up)			C L1
   223  	addq	q0, l0, x			C U0
   224  C 1
   225  	negq	cy, cymask			C L0
   226  	unop					C U1
   227  	unop					C L1
   228  	cmpult	x5555555555555555, x, p6	C U0
   229  C 2
   230  	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
   231  	lda	n, -4(n)			C L1 bookkeeping
   232  	unop
   233  	negq	p6, t0				C L0
   234  C 3
   235  	negq	p7, t1				C L0
   236  	and	cymask, x5555555555555555, l0	C U1
   237  	addq	p6, cy, cy
   238  	and	t0, x5555555555555555, t0
   239  C 4
   240  	and	t1, x5555555555555555, t1
   241  	addq	p7, cy, cy
   242  	unop
   243  	addq	t0, l0, l0
   244  C 5
   245  	addq	t1, l0, l0
   246  	unop
   247  	stq	x, 24(rp)			C L1
   248  	lda	up, 32(up)			C advance source by four limbs
   249  C
   250  	ldl	r31, 256(up)			C prefetch
   251  	unop
   252  	lda	rp, 32(rp)			C advance destination by four limbs
   253  	bge	n, $Ltop			C U1
   254  C *** MAIN LOOP END ***
   255  $Lend:
        C Wind-down, two limbs remain.  Same work as stage 1 for the limb in
        C u3 (product in q1) but without a load; the mulq for the last limb
        C u0 is still issued here.
   256  
   257  	cmpult	u3, cy, cy			C L0
   258  	mulq	u0, xAAAAAAAAAAAAAAAB, q0	C U1
   259  	unop
   260  	addq	q1, l0, x			C U0
   261  C 1
   262  	negq	cy, cymask			C L0
   263  	unop					C U1
   264  	unop					C L1
   265  	cmpult	x5555555555555555, x, p6	C U0
   266  C 2
   267  	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
   268  	unop
   269  	unop
   270  	negq	p6, t0				C L0
   271  C 3
   272  	negq	p7, t1				C L0
   273  	and	cymask, x5555555555555555, l0	C U1
   274  	addq	p6, cy, cy
   275  	and	t0, x5555555555555555, t0
   276  C 4
   277  	and	t1, x5555555555555555, t1
   278  	addq	p7, cy, cy
   279  	unop
   280  	addq	t0, l0, l0
   281  C 5
   282  	addq	t1, l0, l0
   283  	unop
   284  	stq	x, 0(rp)			C L1
   285  	unop
   286  $Lcj1:
        C Last limb (u0, product in q0).  Only the carry is accumulated;
        C l0 is not rebuilt because no further limb consumes it.
   287  	cmpult	u0, cy, cy			C L0
   288  	addq	q0, l0, x			C U0
   289  	cmpult	x5555555555555555, x, p6	C U0
   290  	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
   291  	addq	p6, cy, cy
   292  	addq	p7, cy, r0			C final carry left in r0
   293  	stq	x, 8(rp)			C L1
   294  
   295  	ret	r31,(r26),1
   296  EPILOGUE()
   297  ASM_END()
   298  
   299  C This is useful for playing with various schedules.
   300  C Expand as: one(0)one(1)one(2)one(3)
   301  define(`one',`
        C Template for one pipeline stage.  The argument k is 0, 1, 2 or 3.
        C Register operands are spelled as a dollar sign followed by a
        C number computed with eval:
        C   (k+3) mod 4      limb being finished (compared against cy)
        C   k                limb being multiplied
        C   (k+1) mod 4      limb being loaded, from offset 8k+16 off up
        C   4 + (k mod 2)    product register written by mulq
        C   4 + ((k+1) mod 2)  product register consumed by addq
        C The result limb is stored at offset 8k off rp.
        C NOTE(review): nothing in this file expands the macro and it is
        C defined after ASM_END, so it appears to be reference material only.
        C The numbered registers presumably coincide with u0-u3 and q0/q1;
        C the r-names are defined outside this file, so confirm there.
   302  C 0
   303  	cmpult	`$'eval(($1+3)%4), cy, cy		C L0
   304  	mulq	`$'$1, xAAAAAAAAAAAAAAAB, `$'eval(4+$1%2) C U1
   305  	ldq	`$'eval(($1+1)%4), eval($1*8+16)(up)	C L1
   306  	addq	`$'eval(4+($1+1)%2), l0, x		C U0
   307  C 1
   308  	negq	cy, cymask				C L0
   309  	unop						C U1
   310  	unop						C L1
   311  	cmpult	x5555555555555555, x, p6		C U0
   312  C 2
   313  	cmpult	xAAAAAAAAAAAAAAAA, x, p7		C U1
   314  	unop
   315  	unop
   316  	negq	p6, t0					C L0
   317  C 3
   318  	negq	p7, t1					C L0
   319  	and	cymask, x5555555555555555, l0		C U1
   320  	addq	p6, cy, cy
   321  	and	t0, x5555555555555555, t0
   322  C 4
   323  	and	t1, x5555555555555555, t1
   324  	addq	p7, cy, cy
   325  	unop
   326  	addq	t0, l0, l0
   327  C 5
   328  	addq	t1, l0, l0
   329  	unop
   330  	stq	x, eval($1*8)(rp)			C L1
   331  	unop
   332  ')