github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/ev6/aorsmul_1.asm (about)

     1  dnl  Alpha ev6 mpn_addmul_1 and mpn_submul_1.
     2  
     3  dnl  Copyright 2000, 2003-2005, 2008 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C      cycles/limb
    34  C EV4:    42
    35  C EV5:    18
    36  C EV6:     3.5
    37  
     38  C  INPUT PARAMETERS
        C  rp  r16  limb array that is both loaded from and stored to
        C  up  r17  limb array that is only loaded; each limb is multiplied by v0
        C  n   r18  limb count; the code below refers to it as r18 directly
        C  v0  r19  the single multiplier limb used by every mulq/umulh
     39  define(`rp',	`r16')
     40  define(`up',	`r17')
     41  define(`n',	`r18')
     42  define(`v0',	`r19')
    43  
    44  dnl  This code was written in cooperation with ev6 pipeline expert Steve Root.
    45  
    46  dnl  The stores can issue a cycle late so we have paired no-op's to 'catch'
    47  dnl  them, so that further disturbance to the schedule is damped.
    48  
    49  dnl  We couldn't pair the loads, because the entangled schedule of the carry's
    50  dnl  has to happen on one side {0} of the machine.
    51  
    52  dnl  This is a great schedule for the d_cache, a poor schedule for the b_cache.
    53  dnl  The lockup on U0 means that any stall can't be recovered from.  Consider a
    54  dnl  ldq in L1, say that load gets stalled because it collides with a fill from
     55  dnl  the b_cache.  On the next cycle, this load gets priority.  It first looks
    56  dnl  at L0, and goes there.  The instruction we intended for L0 gets to look at
    57  dnl  L1, which is NOT where we want it.  It either stalls 1, because it can't
    58  dnl  go in L0, or goes there, and causes a further instruction to stall.
    59  
    60  dnl  So for b_cache, we're likely going to want to put one or more cycles back
    61  dnl  into the code! And, of course, put in lds prefetch for the rp[] operand.
    62  dnl  At a place where we have an mt followed by a bookkeeping, put the
    63  dnl  bookkeeping in upper, and the prefetch into lower.
    64  
    65  dnl  Note, the ldq's and stq's are at the end of the quadpacks.  Note, we'd
     66  dnl  like not to have an ldq or an stq to precede a conditional branch in a
    67  dnl  quadpack.  The conditional branch moves the retire pointer one cycle
    68  dnl  later.
    69  
     70  ifdef(`OPERATION_addmul_1',`
        dnl  addmul: ADDSUB adds.  CMPCY(a,b) emits cmpult b,a, so with a the value
        dnl  before the add and b the sum, the destination register written after
        dnl  the macro gets 1 when the add wrapped (sum below a) and 0 otherwise.
     71      define(`ADDSUB',	`addq')
     72      define(`CMPCY',	`cmpult	$2,$1')
     73      define(`func',	`mpn_addmul_1')
     74  ')
     75  ifdef(`OPERATION_submul_1',`
        dnl  submul: ADDSUB subtracts.  CMPCY(a,b) emits cmpult a,b, so with a the
        dnl  value before the subtract and b the difference, the destination
        dnl  register gets 1 when the subtract borrowed (a below b) and 0 otherwise.
     76      define(`ADDSUB',	`subq')
     77      define(`CMPCY',	`cmpult	$1,$2')
     78      define(`func',	`mpn_submul_1')
     79  ')
    80  
     81  MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
     82  
     83  ASM_START()
        C  func is mpn_addmul_1 or mpn_submul_1, as selected by the OPERATION_
        C  ifdefs above.  For each of the n limbs at up, the product up[i]*v0 is
        C  formed with mulq (low half) and umulh (high half).  The low half is
        C  combined with rp[i] by ADDSUB and the result is stored back to rp[i];
        C  the high half, plus the carries or borrows detected by CMPCY, is
        C  folded into the next limb.  The carry limb left after the last limb
        C  is placed in r0 before each ret.
        C
        C  Note that v0 is the m4 name for r19 (the multiplier limb); the result
        C  register is written as r0.
        C
        C  Structure: r20 = n mod 8 selects one of eight lead-in sequences.  The
        C  odd cases (1, 3, 5, 7) process one limb with straight-line code and
        C  continue into an even case.  The even cases (0, 2, 4, 6) start the
        C  multiplies for the first four limbs and enter the 8-limb unrolled
        C  loop at $ent0, $ent2, $Loop or $ent6 respectively.  r18 holds n-9
        C  after entry and is decremented by 8 once per loop pass.  $Lend and
        C  L(x) finish the limbs still in flight when the loop exits.
        C
        C  The U0/U1/L0/L1 tags in the loop comments refer to the issue slots
        C  discussed in the scheduling notes at the top of the file.  The
        C  instruction order in the loop is part of that schedule, so do not
        C  reorder it without re-measuring.
     84  PROLOGUE(func)
        C  Entry: start the load of up[0], set r20 = n mod 8 and r18 = n - 9,
        C  and test for the n mod 8 == 1 case first.
     85  	ldq	r3,	0(up)		C
     86  	and	r18,	7,	r20	C
     87  	lda	r18,	-9(r18)		C
     88  	cmpeq	r20,	1,	r21	C
     89  	beq	r21,	$L1		C
     90  
        C  n mod 8 == 1: do limb 0, leaving high product + carry in r0.  If
        C  r18 >= 0 more limbs remain, so continue at $ent1; otherwise n is 1
        C  and r0 is already the final carry limb.
     91  $1mod8:	ldq	r5,	0(rp)		C
     92  	mulq	v0,	r3,	r7	C
     93  	umulh	v0,	r3,	r8	C
     94  	ADDSUB	r5,	r7,	r23	C
     95  	CMPCY(	r5,	r23),	r20	C
     96  	addq	r8,	r20,	r0	C
     97  	stq	r23,	0(rp)		C
     98  	bge	r18,	$ent1		C
     99  	ret	r31,	(r26),	1	C
    100  
        C  Not the 1 mod 8 case: clear both carry-in registers (r8 and r24),
        C  then dispatch on r20.  Values 2..6 branch out, 7 falls through to
        C  $7mod8, and anything else (0) goes to $0mod8.
    101  $L1:	lda	r8,	0(r31)		C zero carry reg
    102  	lda	r24,	0(r31)		C zero carry reg
    103  	cmpeq	r20,	2,	r21	C
    104  	bne	r21,	$2mod8		C
    105  	cmpeq	r20,	3,	r21	C
    106  	bne	r21,	$3mod8		C
    107  	cmpeq	r20,	4,	r21	C
    108  	bne	r21,	$4mod8		C
    109  	cmpeq	r20,	5,	r21	C
    110  	bne	r21,	$5mod8		C
    111  	cmpeq	r20,	6,	r21	C
    112  	bne	r21,	$6mod8		C
    113  	cmpeq	r20,	7,	r21	C
    114  	beq	r21,	$0mod8		C
    115  
        C  n mod 8 == 7: do one limb, leaving high product + carry in r24, then
        C  advance up and rp by one limb, load the next up limb and fall into
        C  the 6 mod 8 lead-in.
    116  $7mod8:	ldq	r5,	0(rp)		C
    117  	lda	up,	8(up)		C
    118  	mulq	v0,	r3,	r7	C
    119  	umulh	v0,	r3,	r24	C
    120  	ADDSUB	r5,	r7,	r23	C
    121  	CMPCY(	r5,	r23),	r20	C
    122  	addq	r24,	r20,	r24	C
    123  	stq	r23,	0(rp)		C
    124  	lda	rp,	8(rp)		C
    125  	ldq	r3,	0(up)		C
        C  n mod 8 == 6 lead-in (carry-in in r24): start the multiplies for
        C  four limbs, bias up by +48 and rp by -32 to match the offsets used
        C  from $ent6 onward, and enter the loop there.
    126  $6mod8:	ldq	r1,	8(up)		C
    127  	mulq	v0,	r3,	r25	C
    128  	umulh	v0,	r3,	r3	C
    129  	mulq	v0,	r1,	r28	C
    130  	ldq	r0,	16(up)		C
    131  	ldq	r4,	0(rp)		C
    132  	umulh	v0,	r1,	r8	C
    133  	ldq	r1,	24(up)		C
    134  	lda	up,	48(up)		C L1 bookkeeping
    135  	mulq	v0,	r0,	r2	C
    136  	ldq	r5,	8(rp)		C
    137  	lda	rp,	-32(rp)		C L1 bookkeeping
    138  	umulh	v0,	r0,	r6	C
    139  	ADDSUB	r4,	r25,	r25	C lo + acc
    140  	mulq	v0,	r1,	r7	C
    141  	br	r31,	$ent6		C
    142  
        C  Continuation of the 1 mod 8 case when more limbs remain: step up and
        C  rp past the limb just done, copy its carry limb from r0 to r8, load
        C  the next up limb and fall into the 0 mod 8 lead-in.
    143  $ent1:	lda	up,	8(up)		C
    144  	lda	rp,	8(rp)		C
    145  	lda	r8,	0(r0)		C
    146  	ldq	r3,	0(up)		C
        C  n mod 8 == 0 lead-in (carry-in in r8): start the multiplies for four
        C  limbs, bias rp by -16 to match the offsets used from $ent0 onward,
        C  and enter the loop there.  up is not adjusted here; $ent0 advances it.
    147  $0mod8:	ldq	r1,	8(up)		C
    148  	mulq	v0,	r3,	r2	C
    149  	umulh	v0,	r3,	r6	C
    150  	mulq	v0,	r1,	r7	C
    151  	ldq	r0,	16(up)		C
    152  	ldq	r4,	0(rp)		C
    153  	umulh	v0,	r1,	r24	C
    154  	ldq	r1,	24(up)		C
    155  	mulq	v0,	r0,	r25	C
    156  	ldq	r5,	8(rp)		C
    157  	umulh	v0,	r0,	r3	C
    158  	ADDSUB	r4,	r2,	r2	C lo + acc
    159  	mulq	v0,	r1,	r28	C
    160  	lda	rp,	-16(rp)		C
    161  	br	r31,	$ent0		C
    162  
        C  n mod 8 == 3: do one limb, leaving high product + carry in r24, then
        C  advance up and rp by one limb, load the next up limb and fall into
        C  the 2 mod 8 lead-in.
    163  $3mod8:	ldq	r5,	0(rp)		C
    164  	lda	up,	8(up)		C
    165  	mulq	v0,	r3,	r7	C
    166  	umulh	v0,	r3,	r8	C
    167  	ADDSUB	r5,	r7,	r23	C
    168  	CMPCY(	r5,	r23),	r20	C
    169  	addq	r8,	r20,	r24	C
    170  	stq	r23,	0(rp)		C
    171  	lda	rp,	8(rp)		C
    172  	ldq	r3,	0(up)		C
        C  n mod 8 == 2 lead-in (carry-in in r24).  If r18 <= 0 here, n is 2 or
        C  3 and only two limbs remain, so take the short path at $n23.
        C  Otherwise start the multiplies for four limbs, bias up by +16 and
        C  enter the loop at $ent2.
    173  $2mod8:	ldq	r1,	8(up)		C
    174  	mulq	v0,	r3,	r25	C
    175  	umulh	v0,	r3,	r3	C
    176  	mulq	v0,	r1,	r28	C
    177  	ble	r18,	$n23		C
    178  	ldq	r0,	16(up)		C
    179  	ldq	r4,	0(rp)		C
    180  	umulh	v0,	r1,	r8	C
    181  	ldq	r1,	24(up)		C
    182  	lda	up,	16(up)		C L1 bookkeeping
    183  	mulq	v0,	r0,	r2	C
    184  	ldq	r5,	8(rp)		C
    185  	lda	rp,	0(rp)		C L1 bookkeeping
    186  	umulh	v0,	r0,	r6	C
    187  	ADDSUB	r4,	r25,	r25	C lo + acc
    188  	mulq	v0,	r1,	r7	C
    189  	br	r31,	$ent2		C
    190  
        C  n mod 8 == 5: do one limb, leaving high product + carry in r8, then
        C  advance up and rp by one limb, load the next up limb and fall into
        C  the 4 mod 8 lead-in.
    191  $5mod8:	ldq	r5,	0(rp)		C
    192  	lda	up,	8(up)		C
    193  	mulq	v0,	r3,	r7	C
    194  	umulh	v0,	r3,	r24	C
    195  	ADDSUB	r5,	r7,	r23	C
    196  	CMPCY(	r5,	r23),	r20	C
    197  	addq	r24,	r20,	r8	C
    198  	stq	r23,	0(rp)		C
    199  	lda	rp,	8(rp)		C
    200  	ldq	r3,	0(up)		C
        C  n mod 8 == 4 lead-in (carry-in in r8): start the multiplies for four
        C  limbs, bias up by +32 and rp by +16, and do the first carry step of
        C  a loop pass.  If r18 <= 0 here, n is 4 or 5 and only the wind-down
        C  at $Lend is needed; otherwise fall into the top of the loop.
    201  $4mod8:	ldq	r1,	8(up)		C
    202  	mulq	v0,	r3,	r2	C
    203  	umulh	v0,	r3,	r6	C
    204  	mulq	v0,	r1,	r7	C
    205  	ldq	r0,	16(up)		C
    206  	ldq	r4,	0(rp)		C
    207  	umulh	v0,	r1,	r24	C
    208  	ldq	r1,	24(up)		C
    209  	lda	up,	32(up)		C L1 bookkeeping
    210  	mulq	v0,	r0,	r25	C
    211  	ldq	r5,	8(rp)		C
    212  	lda	rp,	16(rp)		C L1 bookkeeping
    213  	umulh	v0,	r0,	r3	C
    214  	ADDSUB	r4,	r2,	r2	C lo + acc
    215  	mulq	v0,	r1,	r28	C
    216  	CMPCY(	r4,	r2),	r20	C L0 lo add => carry
    217  	ADDSUB	r2,	r8,	r22	C U0 hi add => answer
    218  	ble	r18,	$Lend		C
    219  	ALIGN(16)
        C  Main loop: eight limbs per pass, in four groups of two.  Each group
        C  finishes and stores two result limbs (r22 and r23) while issuing the
        C  loads and multiplies for later limbs.  For every limb there are two
        C  carry tests: one after combining rp[i] with the low product, one
        C  after combining that with the incoming high limb; both are added to
        C  the high product of the same limb.  The bis r31,r31,r31 lines are
        C  the no-ops described in the notes at the top of the file.
    220  $Loop:
    221  	bis	r31,	r31,	r31	C U1 mt
    222  	CMPCY(	r2,	r22),	r21	C L0 hi add => carry
    223  	addq	r6,	r20,	r6	C U0 hi mul + carry
    224  	ldq	r0,	0(up)		C
    225  
    226  	bis	r31,	r31,	r31	C U1 mt
    227  	ADDSUB	r5,	r7,	r7	C L0 lo + acc
    228  	addq	r6,	r21,	r6	C U0 hi mul + carry
    229  	ldq	r4,	0(rp)		C L1
    230  
    231  	umulh	v0,	r1,	r8	C U1
    232  	CMPCY(	r5,	r7),	r20	C L0 lo add => carry
    233  	ADDSUB	r7,	r6,	r23	C U0 hi add => answer
    234  	ldq	r1,	8(up)		C L1
    235  
    236  	mulq	v0,	r0,	r2	C U1
    237  	CMPCY(	r7,	r23),	r21	C L0 hi add => carry
    238  	addq	r24,	r20,	r24	C U0 hi mul + carry
    239  	ldq	r5,	8(rp)		C L1
    240  
    241  	umulh	v0,	r0,	r6	C U1
    242  	ADDSUB	r4,	r25,	r25	C U0 lo + acc
    243  	stq	r22,	-16(rp)		C L0
    244  	stq	r23,	-8(rp)		C L1
    245  
    246  	bis	r31,	r31,	r31	C L0 st slosh
    247  	mulq	v0,	r1,	r7	C U1
    248  	bis	r31,	r31,	r31	C L1 st slosh
    249  	addq	r24,	r21,	r24	C U0 hi mul + carry
        C  Loop entry for the 2 mod 8 lead-in.  The limb counter r18 is
        C  decremented by 8 in this group, once per pass.
    250  $ent2:
    251  	CMPCY(	r4,	r25),	r20	C L0 lo add => carry
    252  	bis	r31,	r31,	r31	C U1 mt
    253  	lda	r18,	-8(r18)		C L1 bookkeeping
    254  	ADDSUB	r25,	r24,	r22	C U0 hi add => answer
    255  
    256  	bis	r31,	r31,	r31	C U1 mt
    257  	CMPCY(	r25,	r22),	r21	C L0 hi add => carry
    258  	addq	r3,	r20,	r3	C U0 hi mul + carry
    259  	ldq	r0,	16(up)		C L1
    260  
    261  	bis	r31,	r31,	r31	C U1 mt
    262  	ADDSUB	r5,	r28,	r28	C L0 lo + acc
    263  	addq	r3,	r21,	r3	C U0 hi mul + carry
    264  	ldq	r4,	16(rp)		C L1
    265  
    266  	umulh	v0,	r1,	r24	C U1
    267  	CMPCY(	r5,	r28),	r20	C L0 lo add => carry
    268  	ADDSUB	r28,	r3,	r23	C U0 hi add => answer
    269  	ldq	r1,	24(up)		C L1
    270  
    271  	mulq	v0,	r0,	r25	C U1
    272  	CMPCY(	r28,	r23),	r21	C L0 hi add => carry
    273  	addq	r8,	r20,	r8	C U0 hi mul + carry
    274  	ldq	r5,	24(rp)		C L1
    275  
    276  	umulh	v0,	r0,	r3	C U1
    277  	ADDSUB	r4,	r2,	r2	C U0 lo + acc
    278  	stq	r22,	0(rp)		C L0
    279  	stq	r23,	8(rp)		C L1
    280  
    281  	bis	r31,	r31,	r31	C L0 st slosh
    282  	mulq	v0,	r1,	r28	C U1
    283  	bis	r31,	r31,	r31	C L1 st slosh
    284  	addq	r8,	r21,	r8	C U0 hi mul + carry
        C  Loop entry for the 0 mod 8 lead-in.  up is advanced by 64 bytes in
        C  this group, so the up loads from here to the end of the pass use
        C  negative offsets.
    285  $ent0:
    286  	CMPCY(	r4,	r2),	r20	C L0 lo add => carry
    287  	bis	r31,	r31,	r31	C U1 mt
    288  	lda	up,	64(up)		C L1 bookkeeping
    289  	ADDSUB	r2,	r8,	r22	C U0 hi add => answer
    290  
    291  	bis	r31,	r31,	r31	C U1 mt
    292  	CMPCY(	r2,	r22),	r21	C L0 hi add => carry
    293  	addq	r6,	r20,	r6	C U0 hi mul + carry
    294  	ldq	r0,	-32(up)		C L1
    295  
    296  	bis	r31,	r31,	r31	C U1 mt
    297  	ADDSUB	r5,	r7,	r7	C L0 lo + acc
    298  	addq	r6,	r21,	r6	C U0 hi mul + carry
    299  	ldq	r4,	32(rp)		C L1
    300  
    301  	umulh	v0,	r1,	r8	C U1
    302  	CMPCY(	r5,	r7),	r20	C L0 lo add => carry
    303  	ADDSUB	r7,	r6,	r23	C U0 hi add => answer
    304  	ldq	r1,	-24(up)		C L1
    305  
    306  	mulq	v0,	r0,	r2	C U1
    307  	CMPCY(	r7,	r23),	r21	C L0 hi add => carry
    308  	addq	r24,	r20,	r24	C U0 hi mul + carry
    309  	ldq	r5,	40(rp)		C L1
    310  
    311  	umulh	v0,	r0,	r6	C U1
    312  	ADDSUB	r4,	r25,	r25	C U0 lo + acc
    313  	stq	r22,	16(rp)		C L0
    314  	stq	r23,	24(rp)		C L1
    315  
    316  	bis	r31,	r31,	r31	C L0 st slosh
    317  	mulq	v0,	r1,	r7	C U1
    318  	bis	r31,	r31,	r31	C L1 st slosh
    319  	addq	r24,	r21,	r24	C U0 hi mul + carry
        C  Loop entry for the 6 mod 8 lead-in.  rp is advanced by 64 bytes in
        C  this group, so the rp loads and stores from here to the end of the
        C  pass use negative offsets.
    320  $ent6:
    321  	CMPCY(	r4,	r25),	r20	C L0 lo add => carry
    322  	bis	r31,	r31,	r31	C U1 mt
    323  	lda	rp,	64(rp)		C L1 bookkeeping
    324  	ADDSUB	r25,	r24,	r22	C U0 hi add => answer
    325  
    326  	bis	r31,	r31,	r31	C U1 mt
    327  	CMPCY(	r25,	r22),	r21	C L0 hi add => carry
    328  	addq	r3,	r20,	r3	C U0 hi mul + carry
    329  	ldq	r0,	-16(up)		C L1
    330  
    331  	bis	r31,	r31,	r31	C U1 mt
    332  	ADDSUB	r5,	r28,	r28	C L0 lo + acc
    333  	addq	r3,	r21,	r3	C U0 hi mul + carry
    334  	ldq	r4,	-16(rp)		C L1
    335  
    336  	umulh	v0,	r1,	r24	C U1
    337  	CMPCY(	r5,	r28),	r20	C L0 lo add => carry
    338  	ADDSUB	r28,	r3,	r23	C U0 hi add => answer
    339  	ldq	r1,	-8(up)		C L1
    340  
    341  	mulq	v0,	r0,	r25	C U1
    342  	CMPCY(	r28,	r23),	r21	C L0 hi add => carry
    343  	addq	r8,	r20,	r8	C U0 hi mul + carry
    344  	ldq	r5,	-8(rp)		C L1
    345  
    346  	umulh	v0,	r0,	r3	C U1
    347  	ADDSUB	r4,	r2,	r2	C U0 lo + acc
    348  	stq	r22,	-32(rp)		C L0
    349  	stq	r23,	-24(rp)		C L1
    350  
    351  	bis	r31,	r31,	r31	C L0 st slosh
    352  	mulq	v0,	r1,	r28	C U1
    353  	bis	r31,	r31,	r31	C L1 st slosh
    354  	addq	r8,	r21,	r8	C U0 hi mul + carry
    355  
        C  Bottom of the pass: the same first carry step that $4mod8 performs
        C  before falling into $Loop, the prefetch load (its destination is
        C  r31), and the loop test on the remaining count in r18.
    356  	CMPCY(	r4,	r2),	r20	C L0 lo add => carry
    357  	ADDSUB	r2,	r8,	r22	C U0 hi add => answer
    358  	ldl	r31,	256(up)		C prefetch up[]
    359  	bgt	r18,	$Loop		C U1 bookkeeping
    360  
        C  Wind-down, reached when the loop exits or directly from $4mod8 when
        C  n is 4 or 5.  No further up limbs are loaded.  This part finishes
        C  and stores the limbs at -16(rp) and -8(rp), prepares the next one,
        C  and joins the common tail at L(x) for the last two limbs.
    361  $Lend:	CMPCY(	r2,	r22),	r21	C
    362  	addq	r6,	r20,	r6	C
    363  	ADDSUB	r5,	r7,	r7	C
    364  	addq	r6,	r21,	r6	C
    365  	ldq	r4,	0(rp)		C
    366  	umulh	v0,	r1,	r8	C
    367  	CMPCY(	r5,	r7),	r20	C
    368  	ADDSUB	r7,	r6,	r23	C
    369  	CMPCY(r7,	r23),	r21	C
    370  	addq	r24,	r20,	r24	C
    371  	ldq	r5,	8(rp)		C
    372  	ADDSUB	r4,	r25,	r25	C
    373  	stq	r22,	-16(rp)		C
    374  	stq	r23,	-8(rp)		C
    375  	addq	r24,	r21,	r24	C
    376  	br	L(x)
    377  
    378  	ALIGN(16)
        C  Short path from $2mod8 when n is 2 or 3: load the two rp limbs,
        C  finish the second high product, and fall into the common tail.
    379  $n23:	ldq	r4,	0(rp)		C
    380  	ldq	r5,	8(rp)		C
    381  	umulh	v0,	r1,	r8	C
    382  	ADDSUB	r4,	r25,	r25	C
        C  Common tail: finish the final two limbs (carry-in in r24), store
        C  them at 0(rp) and 8(rp), and leave the final carry limb in r0.
    383  L(x):	CMPCY(	r4,	r25),	r20	C
    384  	ADDSUB	r25,	r24,	r22	C
    385  	CMPCY(	r25,	r22),	r21	C
    386  	addq	r3,	r20,	r3	C
    387  	ADDSUB	r5,	r28,	r28	C
    388  	addq	r3,	r21,	r3	C
    389  	CMPCY(	r5,	r28),	r20	C
    390  	ADDSUB	r28,	r3,	r23	C
    391  	CMPCY(	r28,	r23),	r21	C
    392  	addq	r8,	r20,	r8	C
    393  	stq	r22,	0(rp)		C
    394  	stq	r23,	8(rp)		C
    395  	addq	r8,	r21,	r0	C
    396  	ret	r31,	(r26),	1	C
    397  EPILOGUE()
    398  ASM_END()