github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/ev6/mul_1.asm (about)

     1  dnl  Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the
     2  dnl  result in a second limb vector.
     3  
     4  dnl  Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  C INPUT PARAMETERS
    35  C res_ptr	r16
    36  C s1_ptr	r17
    37  C size		r18
    38  C s2_limb	r19
    39  
    40  C This code runs at 2.25 cycles/limb on EV6.
    41  
    42  C This code was written in close cooperation with ev6 pipeline expert
    43  C Steve Root.  Any errors are tege's fault, though.
    44  
    45  C Code structure:
    46  
    47  C  code for n < 8
    48  C  code for n >= 8	code for (n mod 8)
    49  C			code for (n div 8)	feed-in code
    50  C						8-way unrolled loop
    51  C						wind-down code
    52  
    53  C Some notes about unrolled loop:
    54  C
    55  C   r1-r8     multiplies and workup
    56  C   r21-r28   multiplies and workup
    57  C   r9-r12    loads
    58  C   r0       -1
    59  C   r20,r29,r13-r15  scramble
    60  C
    61  C   We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
    62  C   put-the-carry-into-hi.  The idea is that these branches are very rarely
    63  C   taken, and since a non-taken branch consumes no resources, that is better
    64  C   than an addq.
    65  C
    66  C   Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
    67  C   add NEXT cycle #09 which feeds a store in NEXT cycle #02
    68  
    69  C The code could use some further work:
    70  C   1. Speed up really small multiplies.  The default alpha/mul_1.asm code is
    71  C      faster than this for size < 3.
    72  C   2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless
    73  C      that is too costly.
    74  C   3. Consider using 4-way unrolling, even if that runs slower.
    75  C   4. Reduce register usage.  In particular, try to avoid using r29.
    76  
    77  ASM_START()
    78  PROLOGUE(mpn_mul_1)
        C Multiply the size-limb vector at s1_ptr (r17) by s2_limb (r19), store the
        C low limbs at res_ptr (r16) and leave the carry-out limb in r0.
        C Sizes below 8 use the one-limb-at-a-time code at $Lsmall; larger sizes
        C branch to $Large.
        C NOTE(review): the first limb is loaded and multiplied before size is
        C tested, so size == 0 is not handled; presumably callers pass size >= 1.
    79  	cmpult	r18,	8,	r1	C r1 = (size < 8), unsigned compare
    80  	beq	r1,	$Large	C size >= 8: use the unrolled code
    81  $Lsmall:
    82  	ldq	r2,0(r17)	C r2 = s1_limb
    83  	lda	r18,-1(r18)	C size--
    84  	mulq	r2,r19,r3	C r3 = prod_low
    85  	bic	r31,r31,r4	C clear cy_limb
    86  	umulh	r2,r19,r0	C r0 = prod_high
    87  	beq	r18,$Le1a	C jump if size was == 1
    88  	ldq	r2,8(r17)	C r2 = s1_limb
    89  	lda	r18,-1(r18)	C size--
    90  	stq	r3,0(r16)	C res_ptr[0] = low limb of first product
    91  	beq	r18,$Le2a	C jump if size was == 2
    92  	ALIGN(8)
    93  $Lopa:	mulq	r2,r19,r3	C r3 = prod_low
    94  	addq	r4,r0,r0	C cy_limb = cy_limb + 'cy'
    95  	lda	r18,-1(r18)	C size--
    96  	umulh	r2,r19,r4	C r4 = cy_limb
    97  	ldq	r2,16(r17)	C r2 = s1_limb
    98  	lda	r17,8(r17)	C s1_ptr++
    99  	addq	r3,r0,r3	C r3 = cy_limb + prod_low
   100  	stq	r3,8(r16)	C store result limb; res_ptr++ follows below
   101  	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
   102  	lda	r16,8(r16)	C res_ptr++
   103  	bne	r18,$Lopa	C loop while limbs remain
   104    
   105  $Le2a:	mulq	r2,r19,r3	C r3 = prod_low
   106  	addq	r4,r0,r0	C cy_limb = cy_limb + 'cy'
   107  	umulh	r2,r19,r4	C r4 = cy_limb
   108  	addq	r3,r0,r3	C r3 = cy_limb + prod_low
   109  	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
   110  	stq	r3,8(r16)	C store last result limb
   111  	addq	r4,r0,r0	C cy_limb = prod_high + cy
   112  	ret	r31,(r26),1	C return carry-out limb in r0
   113  $Le1a:	stq	r3,0(r16)	C size == 1: store the single result limb
   114  	ret	r31,(r26),1	C r0 = prod_high of that limb
   115  
   116  $Large:
   117  	lda	r30,	-224(r30)	C allocate frame; only offsets 0..64 are used
   118  	stq	r26,	0(r30)	C save return address, r26 is reused for products
   119  	stq	r9,	8(r30)	C save r9-r15 and r29, all used as scratch below
   120  	stq	r10,	16(r30)
   121  	stq	r11,	24(r30)
   122  	stq	r12,	32(r30)
   123  	stq	r13,	40(r30)
   124  	stq	r14,	48(r30)
   125  	stq	r15,	56(r30)
   126  	stq	r29,	64(r30)
   127    
   128  	and	r18,	7,	r20	C count for the first loop, 0-7
   129  	srl	r18,	3,	r18	C count for unrolled loop
   130  	bis	r31,	r31,	r21	C r21 = 0, carry-in limb if the first loop is skipped
   131  	beq	r20,	$L_8_or_more	C skip first loop
   132    
   133  $L_9_or_more:
        C Despite its name this label starts the (n mod 8) code: the first r20
        C limbs are handled one at a time and the carry-out limb is left in r21
        C for the unrolled code.
   134  	ldq	r2,0(r17)	C r2 = s1_limb
   135  	lda	r17,8(r17)	C s1_ptr++
   136  	lda	r20,-1(r20)	C size--
   137  	mulq	r2,r19,r3	C r3 = prod_low
   138  	umulh	r2,r19,r21	C r21 = prod_high
   139  	beq	r20,$Le1b	C jump if size was == 1
   140  	bis	r31, r31, r0	C FIXME: shouldn't need this
        C (r0 must be zero here because $Lopb and $Le2b read it as the carry
        C in their first addq.)
   141  	ldq	r2,0(r17)	C r2 = s1_limb
   142  	lda	r17,8(r17)	C s1_ptr++
   143  	lda	r20,-1(r20)	C size--
   144  	stq	r3,0(r16)	C store low limb of first product
   145  	lda	r16,8(r16)	C res_ptr++
   146  	beq	r20,$Le2b	C jump if size was == 2
   147  	ALIGN(8)
   148  $Lopb:	mulq	r2,r19,r3	C r3 = prod_low
   149  	addq	r21,r0,r0	C cy_limb = cy_limb + 'cy'
   150  	lda	r20,-1(r20)	C size--
   151  	umulh	r2,r19,r21	C r21 = prod_high
   152  	ldq	r2,0(r17)	C r2 = s1_limb
   153  	lda	r17,8(r17)	C s1_ptr++
   154  	addq	r3,r0,r3	C r3 = cy_limb + prod_low
   155  	stq	r3,0(r16)	C store result limb
   156  	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
   157  	lda	r16,8(r16)	C res_ptr++
   158  	bne	r20,$Lopb	C loop while remainder limbs are left
   159    
   160  $Le2b:	mulq	r2,r19,r3	C r3 = prod_low
   161  	addq	r21,r0,r0	C cy_limb = cy_limb + 'cy'
   162  	umulh	r2,r19,r21	C r21 = prod_high
   163  	addq	r3,r0,r3	C r3 = cy_limb + prod_low
   164  	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
   165  	stq	r3,0(r16)	C store last remainder limb
   166  	lda	r16,8(r16)	C res_ptr++
   167  	addq	r21,r0,r21	C cy_limb = prod_high + cy
   168  	br	r31,	$L_8_or_more	C go on with the unrolled part
   169  $Le1b:	stq	r3,0(r16)	C remainder was one limb; r21 = its prod_high
   170  	lda	r16,8(r16)	C res_ptr++
   171  
   172  $L_8_or_more:
        C Feed-in for the 8-way unrolled code.  On entry r18 = n div 8 and r21
        C is the carry-in limb (0, or the carry-out of the n mod 8 code).
        C The first eight limbs are loaded and their multiplies started.  If
        C more than one group of eight remains (r18 > 0 after the decrement)
        C control continues at $L_16_or_more; otherwise the remaining multiplies
        C are issued here and the wind-down code is entered at $ret0c.
        C NOTE(review): the L0/L1/U0/U1 and #NN tags in the comments presumably
        C name the intended EV6 issue pipe and cycle; confirm against the EV6
        C documentation.
   173  	lda	r0,	-1(r31)		C put -1 in r0, for tricky loop control
   174  	lda	r17,	-32(r17)	C L1 bookkeeping
   175  	lda	r18,	-1(r18)		C decrement count
   176    
   177  	ldq	r9,	32(r17)		C L1
   178  	ldq	r10,	40(r17)		C L1
   179  	mulq	r9,	r19,	r22	C U1 #07
   180  	ldq	r11,	48(r17)		C L1
   181  	umulh	r9,	r19,	r23	C U1 #08
   182  	ldq	r12,	56(r17)		C L1
   183  	mulq	r10,	r19,	r24	C U1 #09
   184  	ldq	r9,	64(r17)		C L1
   185    
   186  	lda	r17,	64(r17)		C L1 bookkeeping
   187    
   188  	umulh	r10,	r19,	r25	C U1 #11
   189  	mulq	r11,	r19,	r26	C U1 #12
   190  	umulh	r11,	r19,	r27	C U1 #13
   191  	mulq	r12,	r19,	r28	C U1 #14
   192  	ldq	r10,	8(r17)		C L1
   193  	umulh	r12,	r19,	r1	C U1 #15
   194  	ldq	r11,	16(r17)		C L1
   195  	mulq	r9,	r19,	r2	C U1 #16
   196  	ldq	r12,	24(r17)		C L1
   197  	umulh	r9,	r19,	r3	C U1 #17
   198  	addq	r21,	r22,	r13	C L1 mov (plus carry-in limb r21)
   199  	mulq	r10,	r19,	r4	C U1 #18
   200  	addq	r23,	r24,	r22	C L0 sum 2 mul's
   201  	cmpult	r13,	r21,	r14	C L1 carry from sum
   202  	bgt	r18,	$L_16_or_more	C two or more groups of eight limbs
   203    
   204  	cmpult	r22,	r24,	r24	C U0 carry from sum
   205  	umulh	r10,	r19,	r5	C U1 #02
   206  	addq	r25,	r26,	r23	C U0 sum 2 mul's
   207  	mulq	r11,	r19,	r6	C U1 #03
   208  	cmpult	r23,	r26,	r25	C U0 carry from sum
   209  	umulh	r11,	r19,	r7	C U1 #04
   210  	addq	r27,	r28,	r28	C U0 sum 2 mul's
   211  	mulq	r12,	r19,	r8	C U1 #05
   212  	cmpult	r28,	r27,	r15	C L0 carry from sum
   213  	lda	r16,	32(r16)		C L1 bookkeeping
   214  	addq	r13,	r31,	r13	C U0 start carry cascade
   215  	umulh	r12,	r19,	r21	C U1 #06
   216  	br	r31,	$ret0c	C single group: join the wind-down code
   217  
   218  $L_16_or_more:
   219  C ---------------------------------------------------------------
        C First pass through the unrolled body, taken when at least two groups
        C of eight limbs remain.  It mirrors the body of $Loop with these
        C differences visible below: r18 is decremented with subq, r16 advances
        C by 32 instead of 64, there are no stores at 0..24(r16), and no
        C incoming carry is added (r31 is used at "start carry cascade" and the
        C $fix0w branch is commented out).
        C The beq branches to $fix4, $fix5 and $fix6 below return to $ret4,
        C $ret5 and $ret6 inside $Loop, not to $ret4w..$ret6w; the instructions
        C from $ret4 to the end of $Loop are the same as from $ret4w to the end
        C of this section, with the closing branch sense inverted.
   220  	subq	r18,1,r18	C decrement pass counter
   221  	cmpult	r22,	r24,	r24	C U0 carry from sum
   222  	ldq	r9,	32(r17)		C L1
   223    
   224  	umulh	r10,	r19,	r5	C U1 #02
   225  	addq	r25,	r26,	r23	C U0 sum 2 mul's
   226  	mulq	r11,	r19,	r6	C U1 #03
   227  	cmpult	r23,	r26,	r25	C U0 carry from sum
   228  	umulh	r11,	r19,	r7	C U1 #04
   229  	addq	r27,	r28,	r28	C U0 sum 2 mul's
   230  	mulq	r12,	r19,	r8	C U1 #05
   231  	cmpult	r28,	r27,	r15	C L0 carry from sum
   232  	lda	r16,	32(r16)		C L1 bookkeeping
   233  	addq	r13,	r31,	r13	C U0 start carry cascade
   234    
   235  	umulh	r12,	r19,	r21	C U1 #06
   236  C	beq	r13,	$fix0w		C U0
   237  $ret0w:	addq	r22,	r14,	r26	C L0
   238  	ldq	r10,	40(r17)		C L1
   239    
   240  	mulq	r9,	r19,	r22	C U1 #07
   241  	beq	r26,	$fix1w		C U0
   242  $ret1w:	addq	r23,	r24,	r27	C L0
   243  	ldq	r11,	48(r17)		C L1
   244    
   245  	umulh	r9,	r19,	r23	C U1 #08
   246  	beq	r27,	$fix2w		C U0
   247  $ret2w:	addq	r28,	r25,	r28	C L0
   248  	ldq	r12,	56(r17)		C L1
   249    
   250  	mulq	r10,	r19,	r24	C U1 #09
   251  	beq	r28,	$fix3w		C U0
   252  $ret3w:	addq	r1,	r2,	r20	C L0 sum 2 mul's
   253  	ldq	r9,	64(r17)		C L1
   254    
   255  	addq	r3,	r4,	r2	C L0 #10 2 mul's
   256  	lda	r17,	64(r17)		C L1 bookkeeping
   257  	cmpult	r20,	r1,	r29	C U0 carry from sum
   258    
   259  	umulh	r10,	r19,	r25	C U1 #11
   260  	cmpult	r2,	r4,	r4	C U0 carry from sum
   261  	stq	r13,	-32(r16)	C L0
   262  	stq	r26,	-24(r16)	C L1
   263    
   264  	mulq	r11,	r19,	r26	C U1 #12
   265  	addq	r5,	r6,	r14	C U0 sum 2 mul's
   266  	stq	r27,	-16(r16)	C L0
   267  	stq	r28,	-8(r16)		C L1
   268    
   269  	umulh	r11,	r19,	r27	C U1 #13
   270  	cmpult	r14,	r6,	r3	C U0 carry from sum
   271  C could do cross-jumping here:
   272  C	bra	$L_middle_of_unrolled_loop
   273  	mulq	r12,	r19,	r28	C U1 #14
   274  	addq	r7,	r3,	r5	C L0 eat carry
   275  	addq	r20,	r15,	r20	C U0 carry cascade
   276  	ldq	r10,	8(r17)		C L1
   277    
   278  	umulh	r12,	r19,	r1	C U1 #15
   279  	beq	r20,	$fix4		C U0
   280  $ret4w:	addq	r2,	r29,	r6	C L0
   281  	ldq	r11,	16(r17)		C L1
   282    
   283  	mulq	r9,	r19,	r2	C U1 #16
   284  	beq	r6,	$fix5		C U0
   285  $ret5w:	addq	r14,	r4,	r7	C L0
   286  	ldq	r12,	24(r17)		C L1
   287    
   288  	umulh	r9,	r19,	r3	C U1 #17
   289  	beq	r7,	$fix6		C U0
   290  $ret6w:	addq	r5,	r8,	r8	C L0 sum 2
   291  	addq	r21,	r22,	r13	C L1 sum 2 mul's
   292    
   293  	mulq	r10,	r19,	r4	C U1 #18
   294  	addq	r23,	r24,	r22	C L0 sum 2 mul's
   295  	cmpult	r13,	r21,	r14	C L1 carry from sum
   296  	ble	r18,	$Lend		C U0
   297  C ---------------------------------------------------------------
   298  	ALIGN(16)
   299  $Loop:
        C Steady state of the 8-way unrolled loop.  Each pass stores eight
        C consecutive result limbs: four at 0..24(r16) before r16 is advanced by
        C 64 and four at -32..-8(r16) afterwards, while loading and multiplying
        C source limbs for later stores (see the software pipeline note in the
        C file header).  r17 also advances by 64 per pass.
        C
        C Carry handling: a carry is added to a sum and the sum is tested with
        C beq.  When the sum is zero the out-of-line $fixN code merges the carry
        C just added into the next position's carry and branches back to $retN.
        C
        C r0 holds -1 (set at $L_8_or_more), so "umulh r0,r18,r18" yields
        C r18 - 1 for r18 >= 1: the pass counter is decremented by a multiply.
   300  	umulh	r0,	r18,	r18	C U1 #01 decrement r18!
   301  	cmpult	r8,	r5,	r29	C L0 carry from last bunch
   302  	cmpult	r22,	r24,	r24	C U0 carry from sum
   303  	ldq	r9,	32(r17)		C L1
   304    
   305  	umulh	r10,	r19,	r5	C U1 #02
   306  	addq	r25,	r26,	r23	C U0 sum 2 mul's
   307  	stq	r20,	0(r16)		C L0
   308  	stq	r6,	8(r16)		C L1
   309    
   310  	mulq	r11,	r19,	r6	C U1 #03
   311  	cmpult	r23,	r26,	r25	C U0 carry from sum
   312  	stq	r7,	16(r16)		C L0
   313  	stq	r8,	24(r16)		C L1
   314    
   315  	umulh	r11,	r19,	r7	C U1 #04
   316  	bis	r31,	r31,	r31	C L0 st slosh
   317  	bis	r31,	r31,	r31	C L1 st slosh
   318  	addq	r27,	r28,	r28	C U0 sum 2 mul's
   319    
   320  	mulq	r12,	r19,	r8	C U1 #05
   321  	cmpult	r28,	r27,	r15	C L0 carry from sum
   322  	lda	r16,	64(r16)		C L1 bookkeeping
   323  	addq	r13,	r29,	r13	C U0 start carry cascade
   324    
   325  	umulh	r12,	r19,	r21	C U1 #06
   326  	beq	r13,	$fix0		C U0
   327  $ret0:	addq	r22,	r14,	r26	C L0
   328  	ldq	r10,	40(r17)		C L1
   329    
   330  	mulq	r9,	r19,	r22	C U1 #07
   331  	beq	r26,	$fix1		C U0
   332  $ret1:	addq	r23,	r24,	r27	C L0
   333  	ldq	r11,	48(r17)		C L1
   334    
   335  	umulh	r9,	r19,	r23	C U1 #08
   336  	beq	r27,	$fix2		C U0
   337  $ret2:	addq	r28,	r25,	r28	C L0
   338  	ldq	r12,	56(r17)		C L1
   339    
   340  	mulq	r10,	r19,	r24	C U1 #09
   341  	beq	r28,	$fix3		C U0
   342  $ret3:	addq	r1,	r2,	r20	C L0 sum 2 mul's
   343  	ldq	r9,	64(r17)		C L1
   344    
   345  	addq	r3,	r4,	r2	C L0 #10 2 mul's
   346  	bis	r31,	r31,	r31	C U1 mul hole
   347  	lda	r17,	64(r17)		C L1 bookkeeping
   348  	cmpult	r20,	r1,	r29	C U0 carry from sum
   349    
   350  	umulh	r10,	r19,	r25	C U1 #11
   351  	cmpult	r2,	r4,	r4	C U0 carry from sum
   352  	stq	r13,	-32(r16)	C L0
   353  	stq	r26,	-24(r16)	C L1
   354    
   355  	mulq	r11,	r19,	r26	C U1 #12
   356  	addq	r5,	r6,	r14	C U0 sum 2 mul's
   357  	stq	r27,	-16(r16)	C L0
   358  	stq	r28,	-8(r16)		C L1
   359    
   360  	umulh	r11,	r19,	r27	C U1 #13
   361  	bis	r31,	r31,	r31	C L0 st slosh
   362  	bis	r31,	r31,	r31	C L1 st slosh
   363  	cmpult	r14,	r6,	r3	C U0 carry from sum
   364  $L_middle_of_unrolled_loop:
   365  	mulq	r12,	r19,	r28	C U1 #14
   366  	addq	r7,	r3,	r5	C L0 eat carry
   367  	addq	r20,	r15,	r20	C U0 carry cascade
   368  	ldq	r10,	8(r17)		C L1
   369    
   370  	umulh	r12,	r19,	r1	C U1 #15
   371  	beq	r20,	$fix4		C U0
   372  $ret4:	addq	r2,	r29,	r6	C L0
   373  	ldq	r11,	16(r17)		C L1
   374    
   375  	mulq	r9,	r19,	r2	C U1 #16
   376  	beq	r6,	$fix5		C U0
   377  $ret5:	addq	r14,	r4,	r7	C L0
   378  	ldq	r12,	24(r17)		C L1
   379    
   380  	umulh	r9,	r19,	r3	C U1 #17
   381  	beq	r7,	$fix6		C U0
   382  $ret6:	addq	r5,	r8,	r8	C L0 sum 2
   383  	addq	r21,	r22,	r13	C L1 sum 2 mul's
   384    
   385  	mulq	r10,	r19,	r4	C U1 #18
   386  	addq	r23,	r24,	r22	C L0 sum 2 mul's
   387  	cmpult	r13,	r21,	r14	C L1 carry from sum
   388  	bgt	r18,	$Loop		C U0
   389  C ---------------------------------------------------------------
   390  $Lend:
        C Wind-down: no more loads.  Store the four limbs pending from the last
        C pass, finish the multiplies for the limbs already in r10-r12, resolve
        C the remaining carries and store the final eight limbs.  $ret0c is also
        C the entry point for the single-group case (branch from the feed-in
        C code after $L_8_or_more).
   391  	cmpult	r8,	r5,	r29	C L0 carry from last bunch
   392  	cmpult	r22,	r24,	r24	C U0 carry from sum
   393    
   394  	umulh	r10,	r19,	r5	C U1 #02
   395  	addq	r25,	r26,	r23	C U0 sum 2 mul's
   396  	stq	r20,	0(r16)		C L0
   397  	stq	r6,	8(r16)		C L1
   398    
   399  	mulq	r11,	r19,	r6	C U1 #03
   400  	cmpult	r23,	r26,	r25	C U0 carry from sum
   401  	stq	r7,	16(r16)		C L0
   402  	stq	r8,	24(r16)		C L1
   403    
   404  	umulh	r11,	r19,	r7	C U1 #04
   405  	addq	r27,	r28,	r28	C U0 sum 2 mul's
   406    
   407  	mulq	r12,	r19,	r8	C U1 #05
   408  	cmpult	r28,	r27,	r15	C L0 carry from sum
   409  	lda	r16,	64(r16)		C L1 bookkeeping
   410  	addq	r13,	r29,	r13	C U0 start carry cascade
   411    
   412  	umulh	r12,	r19,	r21	C U1 #06
   413  	beq	r13,	$fix0c		C U0
   414  $ret0c:	addq	r22,	r14,	r26	C L0
   415  	beq	r26,	$fix1c		C U0
   416  $ret1c:	addq	r23,	r24,	r27	C L0
   417  	beq	r27,	$fix2c		C U0
   418  $ret2c:	addq	r28,	r25,	r28	C L0
   419  	beq	r28,	$fix3c		C U0
   420  $ret3c:	addq	r1,	r2,	r20	C L0 sum 2 mul's
   421  	addq	r3,	r4,	r2	C L0 #10 2 mul's
   422  	lda	r17,	64(r17)		C L1 bookkeeping
   423  	cmpult	r20,	r1,	r29	C U0 carry from sum
   424  	cmpult	r2,	r4,	r4	C U0 carry from sum
   425  	stq	r13,	-32(r16)	C L0
   426  	stq	r26,	-24(r16)	C L1
   427  	addq	r5,	r6,	r14	C U0 sum 2 mul's
   428  	stq	r27,	-16(r16)	C L0
   429  	stq	r28,	-8(r16)		C L1
   430  	cmpult	r14,	r6,	r3	C U0 carry from sum
   431  	addq	r7,	r3,	r5	C L0 eat carry
   432  	addq	r20,	r15,	r20	C U0 carry cascade
   433  	beq	r20,	$fix4c		C U0
   434  $ret4c:	addq	r2,	r29,	r6	C L0
   435  	beq	r6,	$fix5c		C U0
   436  $ret5c:	addq	r14,	r4,	r7	C L0
   437  	beq	r7,	$fix6c		C U0
   438  $ret6c:	addq	r5,	r8,	r8	C L0 sum 2
   439  	cmpult	r8,	r5,	r29	C L0 carry from last bunch
   440  	stq	r20,	0(r16)		C L0
   441  	stq	r6,	8(r16)		C L1
   442  	stq	r7,	16(r16)		C L0
   443  	stq	r8,	24(r16)		C L1
   444  	addq	r29,	r21,	r0	C return value: last prod_high + last carry
   445    
   446  	ldq	r26,	0(r30)	C restore return address
   447  	ldq	r9,	8(r30)	C restore r9-r15 and r29 saved at $Large
   448  	ldq	r10,	16(r30)
   449  	ldq	r11,	24(r30)
   450  	ldq	r12,	32(r30)
   451  	ldq	r13,	40(r30)
   452  	ldq	r14,	48(r30)
   453  	ldq	r15,	56(r30)
   454  	ldq	r29,	64(r30)
   455  	lda	r30,	224(r30)	C release the frame
   456  	ret	r31,	(r26),	1	C return with carry-out limb in r0
   457  
   458  C $fix0w:	bis	r14,	r29,	r14	C join carries
   459  C	br	r31,	$ret0w
        C Out-of-line carry fixups.  Each one is entered by a beq on a sum that
        C became zero after a carry was added.  The bis variants merge the
        C carry that was just added into the carry for the next limb; $fix6 and
        C $fix6c add it into r5 instead.  Every fixup then branches back to the
        C matching $ret label.  The w, plain and c suffixes belong to the
        C $L_16_or_more, $Loop and $Lend sections respectively, except that
        C $L_16_or_more shares $fix4, $fix5 and $fix6 with $Loop.
   460  $fix1w:	bis	r24,	r14,	r24	C join carries
   461  	br	r31,	$ret1w
   462  $fix2w:	bis	r25,	r24,	r25	C join carries
   463  	br	r31,	$ret2w
   464  $fix3w:	bis	r15,	r25,	r15	C join carries
   465  	br	r31,	$ret3w
   466  $fix0:	bis	r14,	r29,	r14	C join carries
   467  	br	r31,	$ret0
   468  $fix1:	bis	r24,	r14,	r24	C join carries
   469  	br	r31,	$ret1
   470  $fix2:	bis	r25,	r24,	r25	C join carries
   471  	br	r31,	$ret2
   472  $fix3:	bis	r15,	r25,	r15	C join carries
   473  	br	r31,	$ret3
   474  $fix4:	bis	r29,	r15,	r29	C join carries
   475  	br	r31,	$ret4
   476  $fix5:	bis	r4,	r29,	r4	C join carries
   477  	br	r31,	$ret5
   478  $fix6:	addq	r5,	r4,	r5	C can't carry twice!
   479  	br	r31,	$ret6
   480  $fix0c:	bis	r14,	r29,	r14	C join carries
   481  	br	r31,	$ret0c
   482  $fix1c:	bis	r24,	r14,	r24	C join carries
   483  	br	r31,	$ret1c
   484  $fix2c:	bis	r25,	r24,	r25	C join carries
   485  	br	r31,	$ret2c
   486  $fix3c:	bis	r15,	r25,	r15	C join carries
   487  	br	r31,	$ret3c
   488  $fix4c:	bis	r29,	r15,	r29	C join carries
   489  	br	r31,	$ret4c
   490  $fix5c:	bis	r4,	r29,	r4	C join carries
   491  	br	r31,	$ret5c
   492  $fix6c:	addq	r5,	r4,	r5	C can't carry twice!
   493  	br	r31,	$ret6c
   494    
   495  EPILOGUE(mpn_mul_1)
   496  ASM_END()