github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc64/mode64/sqr_basecase.asm

dnl  PowerPC-64 mpn_sqr_basecase.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                  cycles/limb
C POWER3/PPC630         6-18
C POWER4/PPC970          8
C POWER5                 8
C POWER6                16.25
C POWER7                 3.77

C NOTES
C  * This is very crude, cleanup!
C  * Try to reduce the number of needed live registers.
C  * Rewrite for POWER6 to use 8 consecutive muls, not 2 groups of 4.  The
C    cost will be more live registers.
C  * Rewrite for POWER7 to use addmul_2 building blocks; this will reduce code
C    size a lot and speed things up perhaps 25%.
C  * Use computed goto in order to compress the code.
C  * Implement a larger final corner.
C  * Schedule callee-saves register saves into other insns.  This could save
C    about 5 cycles/call.  (We cannot analogously optimise the restores, since
C    the sqr_diag_addlsh1 loop has no wind-down code as currently written.)
C  * Should the alternating std/adde sequences be split?  Some pipelines handle
C    adde poorly, and might sequentialise all these instructions.
C  * The sqr_diag_addlsh1 loop was written for POWER6 and its preferences for
C    adjacent integer multiply insns.  Except for the multiply insns, the code
C    was not carefully optimised for POWER6 or any other CPU.
C  * Perform cross-jumping in sqr_diag_addlsh1's feed-in code, into the loop.

C INPUT PARAMETERS
define(`rp', `r3')
define(`up', `r4')
define(`n',  `r5')

define(`rp_outer', `r25')
define(`up_outer', `r21')
define(`rp_saved', `r22')
define(`up_saved', `r23')
define(`n_saved',  `r24')

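C  Function contract (standard GMP mpn semantics): {rp, 2n} = {up, n}^2,
C  n >= 1, operands non-overlapping.  The code exploits
C    u^2 = sum(up[i]^2 * B^(2i))  +  2 * sum_{i<j}(up[i]*up[j] * B^(i+j)):
C  it first builds the off-diagonal triangle (one mul_1-style pass, then
C  addmul_1-style outer passes), and the final sqr_diag_addlsh1 phase
C  doubles that triangle while adding in the diagonal squares.
C
C  A C-level sketch of the same decomposition, using only public GMP mpn
C  calls (illustrative only; ref_sqr_basecase and the n <= 64 bound for
C  the temporary are assumptions, not part of this file):
C
C    #include <gmp.h>
C
C    static void
C    ref_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
C    {
C      mp_limb_t d[2 * 64];	/* diagonal squares; assumes n <= 64 */
C      mp_size_t i;
C      if (n == 1)
C        {
C          mpn_mul_n (rp, up, up, 1);	/* rp[1],rp[0] = up[0]^2 */
C          return;
C        }
C      /* off-diagonal triangle into rp[1..2n-2] */
C      rp[n] = mpn_mul_1 (rp + 1, up + 1, n - 1, up[0]);
C      for (i = 1; i < n - 1; i++)
C        rp[n + i] = mpn_addmul_1 (rp + 2*i + 1, up + i + 1, n - 1 - i, up[i]);
C      /* double the triangle, then add the diagonal squares */
C      rp[2*n - 1] = mpn_lshift (rp + 1, rp + 1, 2*n - 2, 1);
C      rp[0] = 0;
C      for (i = 0; i < n; i++)
C        mpn_mul_n (d + 2*i, up + i, up + i, 1);
C      mpn_add_n (rp, rp, d, 2*n);	/* final carry is 0: squares fit 2n limbs */
C    }
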
ASM_START()
PROLOGUE(mpn_sqr_basecase)
	cmpdi	cr0, n, 2
	bge	cr0, L(ge2)
	ld	r5, 0(up)	C n = 1
	nop
	mulld	r8, r5, r5	C weight 0
	mulhdu	r9, r5, r5	C weight 1
	std	r8, 0(rp)
	std	r9, 8(rp)
	blr
	ALIGN(16)
L(ge2):	bgt	cr0, L(gt2)
	ld	r0, 0(up)	C n = 2
	nop
	mulld	r8, r0, r0	C u0 * u0
	mulhdu	r9, r0, r0	C u0 * u0
	ld	r6, 8(up)
	mulld	r10, r6, r6	C u1 * u1
	mulhdu	r11, r6, r6	C u1 * u1
	mulld	r4, r6, r0	C u1 * u0
	mulhdu	r5, r6, r0	C u1 * u0
	addc	r4, r4, r4
	adde	r5, r5, r5
	addze	r11, r11
	addc	r9, r9, r4
	adde	r10, r10, r5
	addze	r11, r11
	std	r8, 0(rp)
	std	r9, 8(rp)
	std	r10, 16(rp)
	std	r11, 24(rp)
	blr

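C  The n = 2 case above is the identity written out limb by limb:
C  (u1*B + u0)^2 = u1^2*B^2 + 2*u1*u0*B + u0^2.  The first
C  addc/adde/addze triple doubles the cross product u1*u0 (folding the
C  shifted-out bit into the top limb), and the second triple folds the
C  doubled cross product into the middle limbs of u0^2 and u1^2.
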
	ALIGN(16)
L(gt2):	std	r31,  -8(r1)
	std	r30, -16(r1)
	std	r29, -24(r1)
	std	r28, -32(r1)
	std	r27, -40(r1)
	std	r26, -48(r1)
	std	r25, -56(r1)
	std	r24, -64(r1)
	std	r23, -72(r1)
	std	r22, -80(r1)
	std	r21, -88(r1)

	mr	rp_saved, rp
	mr	up_saved, up
	mr	n_saved, n
	mr	rp_outer, rp
	mr	up_outer, up

	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
	cmpdi	cr6, r0, 2
	addic	r7, n, 2	C compute count...
	srdi	r7, r7, 2	C ...for ctr
	mtctr	r7		C copy count into ctr
	beq-	cr0, L(b0)
	blt-	cr6, L(b1)
	beq-	cr6, L(b2)

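C  First pass: rp[1..n-1] = up[1..n-1] * up[0], a mul_1.  L(b0)..L(b3)
C  peel n mod 4 limbs so the 4-way unrolled loops L(tm0)..L(tm3) run a
C  whole number of times; r12 carries the high limb between iterations.
C  The adde chains run uninterrupted across the ld/std insns (loads and
C  stores leave XER[CA] alone), and the addic above (rather than addi)
C  doubled as a carry-clear for the first adde.
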
L(b3):	ld	r6, 0(up)
	ld	r9, 8(up)
	ld	r27, 16(up)
	addi	up, up, 24
	li	r12, 0		C carry limb
	bdz	L(em3)

	ALIGN(16)
L(tm3):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r9, 0(up)
	ld	r27, 8(up)
	adde	r0, r0, r12
	adde	r7, r7, r26
	mulld	r26, r9, r6
	mulhdu	r10, r9, r6
	mulld	r11, r27, r6
	mulhdu	r12, r27, r6
	ld	r9, 16(up)
	ld	r27, 24(up)
	std	r0, 8(rp)
	adde	r26, r26, r8
	std	r7, 16(rp)
	adde	r11, r11, r10
	std	r26, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	bdnz	L(tm3)

L(em3):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	adde	r0, r0, r12
	adde	r7, r7, r26
	std	r0, 8(rp)
	std	r7, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	addi	n, n, 2
	b	L(outer_loop)

L(b0):	ld	r6, 0(up)
	ld	r27, 8(up)
	mulld	r7, r27, r6
	mulhdu	r12, r27, r6
	std	r7, 8(rp)
	addi	rp, rp, 8
	ld	r9, 16(up)
	ld	r27, 24(up)
	addi	up, up, 32
	bdz	L(em0)

	ALIGN(16)
L(tm0):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r9, 0(up)
	ld	r27, 8(up)
	adde	r0, r0, r12
	adde	r7, r7, r26
	mulld	r26, r9, r6
	mulhdu	r10, r9, r6
	mulld	r11, r27, r6
	mulhdu	r12, r27, r6
	ld	r9, 16(up)
	ld	r27, 24(up)
	std	r0, 8(rp)
	adde	r26, r26, r8
	std	r7, 16(rp)
	adde	r11, r11, r10
	std	r26, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	bdnz	L(tm0)

L(em0):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	adde	r0, r0, r12
	adde	r7, r7, r26
	std	r0, 8(rp)
	std	r7, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	addi	n, n, 2
	b	L(outer_loop_ent_2)

L(b1):	ld	r6, 0(up)
	ld	r9, 8(up)
	ld	r27, 16(up)
	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r12, r27, r6
	addc	r7, r7, r26
	std	r0, 8(rp)
	std	r7, 16(rp)
	addi	rp, rp, 16
	ld	r9, 24(up)
	ld	r27, 32(up)
	addi	up, up, 40
	bdz	L(em1)

	ALIGN(16)
L(tm1):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r9, 0(up)
	ld	r27, 8(up)
	adde	r0, r0, r12
	adde	r7, r7, r26
	mulld	r26, r9, r6
	mulhdu	r10, r9, r6
	mulld	r11, r27, r6
	mulhdu	r12, r27, r6
	ld	r9, 16(up)
	ld	r27, 24(up)
	std	r0, 8(rp)
	adde	r26, r26, r8
	std	r7, 16(rp)
	adde	r11, r11, r10
	std	r26, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	bdnz	L(tm1)

L(em1):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	adde	r0, r0, r12
	adde	r7, r7, r26
	std	r0, 8(rp)
	std	r7, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	addi	n, n, 2
	b	L(outer_loop_ent_3)

L(b2):	addi	r7, r7, -1	C FIXME
	mtctr	r7		C FIXME
	ld	r6, 0(up)
	ld	r9, 8(up)
	ld	r27, 16(up)
	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r9, 24(up)
	mulld	r11, r9, r6
	mulhdu	r10, r9, r6
	addc	r7, r7, r26
	adde	r11, r11, r8
	addze	r12, r10
	std	r0, 8(rp)
	std	r7, 16(rp)
	std	r11, 24(rp)
	addi	rp, rp, 24
	ld	r9, 32(up)
	ld	r27, 40(up)
	addi	up, up, 48
	bdz	L(em2)

	ALIGN(16)
L(tm2):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r9, 0(up)
	ld	r27, 8(up)
	adde	r0, r0, r12
	adde	r7, r7, r26
	mulld	r26, r9, r6
	mulhdu	r10, r9, r6
	mulld	r11, r27, r6
	mulhdu	r12, r27, r6
	ld	r9, 16(up)
	ld	r27, 24(up)
	std	r0, 8(rp)
	adde	r26, r26, r8
	std	r7, 16(rp)
	adde	r11, r11, r10
	std	r26, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	bdnz	L(tm2)

L(em2):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	adde	r0, r0, r12
	adde	r7, r7, r26
	std	r0, 8(rp)
	std	r7, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	addi	n, n, 2
	b	L(outer_loop_ent_0)


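C  Outer loop: each iteration accumulates one more partial product row,
C  rp[...] += up[...] * u (an addmul_1 pass with the next outer limb u).
C  The four body copies fall through in the order L(outer_loop) ->
C  ent_0 -> ent_3 -> ent_2 -> back to L(outer_loop): n shrinks by 1 per
C  pass, so the n mod 4 phase rotates, and each copy peels a different
C  remainder before its 4-way unrolled inner loop (ta1/ta0/ta3/ta2).
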
L(outer_loop):
	addi	n, n, -1
	addi	up_outer, up_outer, 8
	addi	rp_outer, rp_outer, 16

	mr	up, up_outer
	addi	rp, rp_outer, 8

	srdi	r0, n, 2
	mtctr	r0

	bdz	L(outer_end)

	ld	r6, 0(up)
	ld	r9, 8(up)
	ld	r27, 16(up)
	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r9, 24(up)
	ld	r28, 0(rp)
	ld	r29, 8(rp)
	ld	r30, 16(rp)
	mulld	r11, r9, r6
	mulhdu	r10, r9, r6
	addc	r7, r7, r26
	adde	r11, r11, r8
	addze	r12, r10
	addc	r0, r0, r28
	std	r0, 0(rp)
	adde	r7, r7, r29
	std	r7, 8(rp)
	adde	r11, r11, r30
	std	r11, 16(rp)
	addi	rp, rp, 24
	ld	r9, 32(up)
	ld	r27, 40(up)
	addi	up, up, 48
	bdz	L(ea1)

	ALIGN(16)
L(ta1):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6	C 9
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6	C 27
	ld	r9, 0(up)
	ld	r28, 0(rp)
	ld	r27, 8(up)
	ld	r29, 8(rp)
	adde	r0, r0, r12	C 0 12
	adde	r7, r7, r26	C 5 7
	mulld	r26, r9, r6
	mulhdu	r10, r9, r6	C 9
	mulld	r11, r27, r6
	mulhdu	r12, r27, r6	C 27
	ld	r9, 16(up)
	ld	r30, 16(rp)
	ld	r27, 24(up)
	ld	r31, 24(rp)
	adde	r26, r26, r8	C 8 5
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 0(rp)	C 0
	adde	r7, r7, r29	C 7 29
	std	r7, 8(rp)	C 7
	adde	r26, r26, r30	C 5 30
	std	r26, 16(rp)	C 5
	adde	r11, r11, r31	C 11 31
	std	r11, 24(rp)	C 11
	addi	up, up, 32
	addi	rp, rp, 32
	bdnz	L(ta1)

L(ea1):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r28, 0(rp)
	ld	r29, 8(rp)
	adde	r0, r0, r12
	adde	r7, r7, r26
	addze	r8, r8
	addc	r0, r0, r28
	std	r0, 0(rp)
	adde	r7, r7, r29
	std	r7, 8(rp)
	addze	r8, r8
	std	r8, 16(rp)

L(outer_loop_ent_0):
	addi	n, n, -1
	addi	up_outer, up_outer, 8
	addi	rp_outer, rp_outer, 16

	mr	up, up_outer
	addi	rp, rp_outer, 8

	srdi	r0, n, 2
	mtctr	r0

	ld	r6, 0(up)
	ld	r9, 8(up)
	ld	r27, 16(up)
	ld	r28, 0(rp)
	ld	r29, 8(rp)
	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	addc	r0, r0, r28
	adde	r7, r7, r26
	addze	r12, r8
	std	r0, 0(rp)
	adde	r7, r7, r29
	std	r7, 8(rp)
	addi	rp, rp, 16
	ld	r9, 24(up)
	ld	r27, 32(up)
	addi	up, up, 40
	bdz	L(ea0)

	ALIGN(16)
L(ta0):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6	C 9
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6	C 27
	ld	r9, 0(up)
	ld	r28, 0(rp)
	ld	r27, 8(up)
	ld	r29, 8(rp)
	adde	r0, r0, r12	C 0 12
	adde	r7, r7, r26	C 5 7
	mulld	r26, r9, r6
	mulhdu	r10, r9, r6	C 9
	mulld	r11, r27, r6
	mulhdu	r12, r27, r6	C 27
	ld	r9, 16(up)
	ld	r30, 16(rp)
	ld	r27, 24(up)
	ld	r31, 24(rp)
	adde	r26, r26, r8	C 8 5
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 0(rp)	C 0
	adde	r7, r7, r29	C 7 29
	std	r7, 8(rp)	C 7
	adde	r26, r26, r30	C 5 30
	std	r26, 16(rp)	C 5
	adde	r11, r11, r31	C 11 31
	std	r11, 24(rp)	C 11
	addi	up, up, 32
	addi	rp, rp, 32
	bdnz	L(ta0)

L(ea0):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r28, 0(rp)
	ld	r29, 8(rp)
	adde	r0, r0, r12
	adde	r7, r7, r26
	addze	r8, r8
	addc	r0, r0, r28
	std	r0, 0(rp)
	adde	r7, r7, r29
	std	r7, 8(rp)
	addze	r8, r8
	std	r8, 16(rp)

L(outer_loop_ent_3):
	addi	n, n, -1
	addi	up_outer, up_outer, 8
	addi	rp_outer, rp_outer, 16

	mr	up, up_outer
	addi	rp, rp_outer, 8

	srdi	r0, n, 2
	mtctr	r0

	ld	r6, 0(up)
	ld	r9, 8(up)
	ld	r28, 0(rp)
	mulld	r0, r9, r6
	mulhdu	r12, r9, r6
	addc	r0, r0, r28
	std	r0, 0(rp)
	addi	rp, rp, 8
	ld	r9, 16(up)
	ld	r27, 24(up)
	addi	up, up, 32
	bdz	L(ea3)

	ALIGN(16)
L(ta3):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6	C 9
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6	C 27
	ld	r9, 0(up)
	ld	r28, 0(rp)
	ld	r27, 8(up)
	ld	r29, 8(rp)
	adde	r0, r0, r12	C 0 12
	adde	r7, r7, r26	C 5 7
	mulld	r26, r9, r6
	mulhdu	r10, r9, r6	C 9
	mulld	r11, r27, r6
	mulhdu	r12, r27, r6	C 27
	ld	r9, 16(up)
	ld	r30, 16(rp)
	ld	r27, 24(up)
	ld	r31, 24(rp)
	adde	r26, r26, r8	C 8 5
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 0(rp)	C 0
	adde	r7, r7, r29	C 7 29
	std	r7, 8(rp)	C 7
	adde	r26, r26, r30	C 5 30
	std	r26, 16(rp)	C 5
	adde	r11, r11, r31	C 11 31
	std	r11, 24(rp)	C 11
	addi	up, up, 32
	addi	rp, rp, 32
	bdnz	L(ta3)

L(ea3):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r28, 0(rp)
	ld	r29, 8(rp)
	adde	r0, r0, r12
	adde	r7, r7, r26
	addze	r8, r8
	addc	r0, r0, r28
	std	r0, 0(rp)
	adde	r7, r7, r29
	std	r7, 8(rp)
	addze	r8, r8
	std	r8, 16(rp)


L(outer_loop_ent_2):
	addi	n, n, -1
	addi	up_outer, up_outer, 8
	addi	rp_outer, rp_outer, 16

	mr	up, up_outer
	addi	rp, rp_outer, 8

	srdi	r0, n, 2
	mtctr	r0

	addic	r0, r0, 0
	li	r12, 0		C cy_limb = 0
	ld	r6, 0(up)
	ld	r9, 8(up)
	ld	r27, 16(up)
	bdz	L(ea2)
	addi	up, up, 24

	ALIGN(16)
L(ta2):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6	C 9
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6	C 27
	ld	r9, 0(up)
	ld	r28, 0(rp)
	ld	r27, 8(up)
	ld	r29, 8(rp)
	adde	r0, r0, r12	C 0 12
	adde	r7, r7, r26	C 5 7
	mulld	r26, r9, r6
	mulhdu	r10, r9, r6	C 9
	mulld	r11, r27, r6
	mulhdu	r12, r27, r6	C 27
	ld	r9, 16(up)
	ld	r30, 16(rp)
	ld	r27, 24(up)
	ld	r31, 24(rp)
	adde	r26, r26, r8	C 8 5
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 0(rp)	C 0
	adde	r7, r7, r29	C 7 29
	std	r7, 8(rp)	C 7
	adde	r26, r26, r30	C 5 30
	std	r26, 16(rp)	C 5
	adde	r11, r11, r31	C 11 31
	std	r11, 24(rp)	C 11
	addi	up, up, 32
	addi	rp, rp, 32
	bdnz	L(ta2)

L(ea2):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r28, 0(rp)
	ld	r29, 8(rp)
	adde	r0, r0, r12
	adde	r7, r7, r26
	addze	r8, r8
	addc	r0, r0, r28
	std	r0, 0(rp)
	adde	r7, r7, r29
	std	r7, 8(rp)
	addze	r8, r8
	std	r8, 16(rp)

	b	L(outer_loop)

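C  Down to the last off-diagonal term: the single 1x1 corner product of
C  the top two limbs is accumulated here (cf. the "larger final corner"
C  item in the NOTES above).
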
L(outer_end):
	ld	r6, 0(up)
	ld	r9, 8(up)
	ld	r11, 0(rp)
	mulld	r0, r9, r6
	mulhdu	r8, r9, r6
	addc	r0, r0, r11
	std	r0, 0(rp)
	addze	r8, r8
	std	r8, 8(rp)

define(`rp',  `rp_saved')
define(`up',  `r5')
define(`n',   `r6')
define(`climb',	`r0')

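C  sqr_diag_addlsh1 phase: the triangle T = sum_{i<j} up[i]*up[j] now
C  sits in rp[1..2n-2]; what remains is rp = 2*T + D with
C  D = sum(up[i]^2 * B^(2i)).  Each block below doubles a run of rp
C  limbs with an adde r,r,r chain (ld/std do not disturb XER[CA], so
C  one carry survives the whole chain), then adds the freshly computed
C  squares; climb hands the top square's high limb to the next block.
C  L(xb0)..L(xb3) again peel n mod 4 limbs ahead of the L(top) loop.
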
	addi	r4, rp_saved, 8
	mr	r5, up_saved
	mr	r6, n_saved

	rldicl.	r0, n, 0,62		C r0 = n & 3, set cr0
	cmpdi	cr6, r0, 2
	addi	n, n, 2			C compute count...
	srdi	n, n, 2			C ...for ctr
	mtctr	n			C put loop count into ctr
	beq	cr0, L(xb0)
	blt	cr6, L(xb1)
	beq	cr6, L(xb2)

L(xb3):	ld	r6,   0(up)
	ld	r7,   8(up)
	ld	r12, 16(up)
	addi	up, up, 24
	mulld	r24, r6, r6
	mulhdu	r25, r6, r6
	mulld	r26, r7, r7
	mulhdu	r27, r7, r7
	mulld	r28, r12, r12
	mulhdu	r29, r12, r12
	ld	r10,  8(rp)
	ld	r11, 16(rp)
	ld	r6,  24(rp)
	ld	r7,  32(rp)
	addc	r10, r10, r10
	adde	r11, r11, r11
	adde	r6, r6, r6
	adde	r7, r7, r7
	addze	climb, r29
	addc	r10, r10, r25
	adde	r11, r11, r26
	adde	r6, r6, r27
	adde	r7, r7, r28
	std	r24,  0(rp)
	std	r10,  8(rp)
	std	r11, 16(rp)
	std	r6,  24(rp)
	std	r7,  32(rp)
	addi	rp, rp, 40
	bdnz	L(top)
	b	L(end)

L(xb2):	ld	r6,  0(up)
	ld	r7,  8(up)
	addi	up, up, 16
	mulld	r24, r6, r6
	mulhdu	r25, r6, r6
	mulld	r26, r7, r7
	mulhdu	r27, r7, r7
	ld	r10,  8(rp)
	ld	r11, 16(rp)
	addc	r10, r10, r10
	adde	r11, r11, r11
	addze	climb, r27
	addc	r10, r10, r25
	adde	r11, r11, r26
	std	r24,  0(rp)
	std	r10,  8(rp)
	std	r11, 16(rp)
	addi	rp, rp, 24
	bdnz	L(top)
	b	L(end)

L(xb0):	ld	r6,   0(up)
	ld	r7,   8(up)
	ld	r12, 16(up)
	ld	r23, 24(up)
	addi	up, up, 32
	mulld	r24, r6, r6
	mulhdu	r25, r6, r6
	mulld	r26, r7, r7
	mulhdu	r27, r7, r7
	mulld	r28, r12, r12
	mulhdu	r29, r12, r12
	mulld	r30, r23, r23
	mulhdu	r31, r23, r23
	ld	r10,  8(rp)
	ld	r11, 16(rp)
	ld	r6,  24(rp)
	ld	r7,  32(rp)
	ld	r12, 40(rp)
	ld	r23, 48(rp)
	addc	r10, r10, r10
	adde	r11, r11, r11
	adde	r6, r6, r6
	adde	r7, r7, r7
	adde	r12, r12, r12
	adde	r23, r23, r23
	addze	climb, r31
	std	r24,  0(rp)
	addc	r10, r10, r25
	std	r10,  8(rp)
	adde	r11, r11, r26
	std	r11, 16(rp)
	adde	r6, r6, r27
	std	r6,  24(rp)
	adde	r7, r7, r28
	std	r7,  32(rp)
	adde	r12, r12, r29
	std	r12, 40(rp)
	adde	r23, r23, r30
	std	r23, 48(rp)
	addi	rp, rp, 56
	bdnz	L(top)
	b	L(end)

L(xb1):	ld	r6,  0(up)
	addi	up, up, 8
	mulld	r24, r6, r6
	mulhdu	climb, r6, r6
	std	r24, 0(rp)
	addic	rp, rp, 8		C clear carry as side-effect

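C  Main diagonal loop: double eight rp limbs, add in four squares, and
C  pass the last square's high limb on via climb.  The loop opens with
C  adde, not addc: the carry left over from the previous block's
C  addition chain belongs at exactly this weight.  (On the L(xb1) path
C  the addic just above cleared CA before entering.)
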
	ALIGN(32)
L(top):	ld	r6,   0(up)
	ld	r7,   8(up)
	ld	r12, 16(up)
	ld	r23, 24(up)
	addi	up, up, 32
	mulld	r24, r6, r6
	mulhdu	r25, r6, r6
	mulld	r26, r7, r7
	mulhdu	r27, r7, r7
	mulld	r28, r12, r12
	mulhdu	r29, r12, r12
	mulld	r30, r23, r23
	mulhdu	r31, r23, r23
	ld	r8,   0(rp)
	ld	r9,   8(rp)
	adde	r8, r8, r8
	adde	r9, r9, r9
	ld	r10, 16(rp)
	ld	r11, 24(rp)
	adde	r10, r10, r10
	adde	r11, r11, r11
	ld	r6,  32(rp)
	ld	r7,  40(rp)
	adde	r6, r6, r6
	adde	r7, r7, r7
	ld	r12, 48(rp)
	ld	r23, 56(rp)
	adde	r12, r12, r12
	adde	r23, r23, r23
	addze	r31, r31
	addc	r8, r8, climb
	std	r8,   0(rp)
	adde	r9, r9, r24
	std	r9,   8(rp)
	adde	r10, r10, r25
	std	r10, 16(rp)
	adde	r11, r11, r26
	std	r11, 24(rp)
	adde	r6, r6, r27
	std	r6,  32(rp)
	adde	r7, r7, r28
	std	r7,  40(rp)
	adde	r12, r12, r29
	std	r12, 48(rp)
	adde	r23, r23, r30
	std	r23, 56(rp)
	mr	climb, r31
	addi	rp, rp, 64
	bdnz	L(top)

L(end):	addze	climb, climb
	std	climb,  0(rp)

	ld	r31,  -8(r1)
	ld	r30, -16(r1)
	ld	r29, -24(r1)
	ld	r28, -32(r1)
	ld	r27, -40(r1)
	ld	r26, -48(r1)
	ld	r25, -56(r1)
	ld	r24, -64(r1)
	ld	r23, -72(r1)
	ld	r22, -80(r1)
	ld	r21, -88(r1)
	blr
EPILOGUE()