github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc64/mode64/p6/mul_basecase.asm (about)

     1  dnl  PowerPC-64 mpn_mul_basecase.
     2  
     3  dnl  Copyright 1999-2001, 2003-2006, 2008, 2010 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C		    cycles/limb
    34  C POWER3/PPC630		 ?
    35  C POWER4/PPC970		 ?
    36  C POWER5		 ?
    37  C POWER6		12.25
    38  
    39  C TODO
C  * Reduce register usage.  At least 4 fewer registers can be used.
    41  C  * Unroll more.  8-way unrolling would bring us to 10 c/l, 16-way unrolling
    42  C    would bring us to 9 c/l.
C  * The bdz insns for b1 and b2 will never branch.
    44  C  * Align things better, perhaps by moving things like pointer updates from
    45  C    before to after loops.
    46  
C INPUT PARAMETERS
C (register mapping per the standard GMP mpn calling convention; the code
C  assumes un >= vn >= 1 -- standard mpn_mul_basecase contract, not checked)
define(`rp', `r3')	C result area, un+vn limbs
define(`up', `r4')	C first operand pointer
define(`un', `r5')	C first operand size in limbs
define(`vp', `r6')	C second operand pointer
define(`vn', `r7')	C second operand size in limbs

define(`v0',	   `r25')	C current limb of {vp,vn}
define(`outer_rp', `r22')	C rp anchor, advanced 1 limb per outer iteration
define(`outer_up', `r23')	C up anchor, fixed for the whole call
    57  
ASM_START()
C mpn_mul_basecase(rp, up, un, vp, vn)
C Full {un+vn}-limb product of {up,un} and {vp,vn}, written to {rp,un+vn}.
PROLOGUE(mpn_mul_basecase)

C Special code for un <= 2, for efficiency of these important cases,
C and since it simplifies the default code.
	cmpdi	cr0, un, 2	C cr0: un vs 2 (un<=2 implies vn<=2)
	bgt	cr0, L(un_gt2)
	cmpdi	cr6, vn, 1	C remember vn==1 before r7 is reused below
	ld	r7, 0(vp)	C r7 = vp[0]; vn (r7) is dead, cr6 holds its test
	ld	r5, 0(up)	C r5 = up[0]; un (r5) is dead, cr0 holds its test
	mulld	r8, r5, r7	C weight 0
	mulhdu	r9, r5, r7	C weight 1
	std	r8, 0(rp)
	beq	cr0, L(2x)	C un == 2?
	std	r9, 8(rp)	C un == vn == 1: 2-limb product done
	blr
	ALIGN(16)
C un == 2: fold in up[1] * vp[0]
L(2x):	ld	r0, 8(up)
	mulld	r8, r0, r7	C weight 1
	mulhdu	r10, r0, r7	C weight 2
	addc	r9, r9, r8
	addze	r10, r10
	bne	cr6, L(2x2)	C vn == 2?
	std	r9, 8(rp)	C un == 2, vn == 1: done
	std	r10, 16(rp)
	blr
	ALIGN(16)
C un == vn == 2: add the cross products with vp[1]
L(2x2):	ld	r6, 8(vp)
	nop
	mulld	r8, r5, r6	C weight 1
	mulhdu	r11, r5, r6	C weight 2
	mulld	r12, r0, r6	C weight 2
	mulhdu	r0, r0, r6	C weight 3
	addc	r9, r9, r8
	std	r9, 8(rp)
	adde	r11, r11, r10
	addze	r0, r0
	addc	r11, r11, r12
	addze	r0, r0
	std	r11, 16(rp)
	std	r0, 24(rp)
	blr
   100  
C General case, un >= 3.  Save callee-saved r20..r31 below the stack
C pointer without allocating a frame; this function makes no calls, so
C the area under r1 stays untouched by anything else.
L(un_gt2):
	std	r31, -8(r1)
	std	r30, -16(r1)
	std	r29, -24(r1)
	std	r28, -32(r1)
	std	r27, -40(r1)
	std	r26, -48(r1)
	std	r25, -56(r1)
	std	r24, -64(r1)
	std	r23, -72(r1)
	std	r22, -80(r1)
	std	r21, -88(r1)
	std	r20, -96(r1)

	mr	outer_rp, rp
	mr	outer_up, up

	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, 0(up)

C Dispatch on un mod 4; the loops are 4-way unrolled.  un is recycled to
C hold the shared inner-loop chunk count (un+4)>>2, reloaded into ctr at
C the top of every outer iteration.
	rldicl.	r0, un, 0,62	C r0 = n & 3, set cr0
	cmpdi	cr6, r0, 2
	addi	un, un, 4	C compute count...
	srdi	un, un, 2	C ...for ctr
	mtctr	un		C copy inner loop count into ctr
	beq	cr0, L(b0)	C un = 0 (mod 4)
	blt	cr6, L(b1)	C un = 1 (mod 4)
	beq	cr6, L(b2)	C un = 2 (mod 4); else fall through to L(b3)
   130  
   131  
	ALIGN(16)
C First pass for un = 3 (mod 4): {rp,un+1} = {up,un} * v0 (mul_1).
C Three limbs are peeled here; L(lo_m_3) then handles 4 limbs per ctr
C iteration.  r12 carries the chunk's high limb forward, and the CA flag
C stays live from one iteration's adde chain into the next.
L(b3):
	ld	r27, 8(up)
	ld	r20, 16(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r20, v0
	mulhdu	r10, r20, v0
	addc	r24, r24, r31
	adde	r9, r9, r8
	addze	r12, r10	C r12 = carry limb into the loop (CA cleared)
	std	r0, 0(rp)
	std	r24, 8(rp)
	std	r9, 16(rp)
	addi	up, up, 16
	addi	rp, rp, 16
	bdz	L(end_m_3)	C un == 3: no full 4-limb chunks

	ALIGN(32)
L(lo_m_3):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)
	ld	r21, 32(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r20, v0
	mulhdu	r27, r20, v0	C r27 recycled once its product is formed
	mulld	r11, r21, v0
	mulhdu	r26, r21, v0	C r26 recycled likewise
	adde	r0, r0, r12	C consume carry limb + CA from previous chunk
	adde	r24, r24, r31
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r27
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	mr	r12, r26	C pass high limb to next iteration (CA still set)
	bdnz	L(lo_m_3)

	ALIGN(16)
L(end_m_3):
	addze	r12, r12	C fold final CA into the top limb
	addic.	vn, vn, -1	C one v limb consumed; also clears CA
	std	r12, 8(rp)
	beq	L(ret)		C vn exhausted -> done; else addmul passes
   185  
	ALIGN(16)
C Outer loop for un = 3 (mod 4): addmul_1 pass -- multiply {up,un} by the
C next v limb and add into the partial product at rp, one pass per
C remaining v limb.
L(outer_lo_3):
	mtctr	un		C copy inner loop count into ctr
	addi	rp, outer_rp, 24
	addi	up, outer_up, 16
	addi	outer_rp, outer_rp, 8	C result advances 1 limb per v limb
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, -16(up)
	ld	r27, -8(up)
	ld	r20, 0(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r20, v0
	mulhdu	r10, r20, v0
	ld	r28, -16(rp)	C existing partial-product limbs
	ld	r29, -8(rp)
	ld	r30, 0(rp)
	addc	r24, r24, r31
	adde	r9, r9, r8
	addze	r12, r10
	addc	r0, r0, r28	C add products into {rp,...}
	std	r0, -16(rp)
	adde	r24, r24, r29
	std	r24, -8(rp)
	adde	r9, r9, r30
	std	r9, 0(rp)
	bdz	L(end_3)

	ALIGN(32)		C registers dying
L(lo_3):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)	C
	ld	r21, 32(up)	C
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	mulld	r9, r20, v0	C
	mulhdu	r27, r20, v0	C 26
	mulld	r11, r21, v0	C
	mulhdu	r26, r21, v0	C 27
	ld	r28, -24(rp)	C
	adde	r0, r0, r12	C 0 12
	ld	r29, -16(rp)	C
	adde	r24, r24, r10	C 24 10
	ld	r30, -8(rp)	C
	ld	r31, 0(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r27	C 27 11
	addze	r12, r26	C 26
	addc	r0, r0, r28	C 0 28
	std	r0, -24(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, -16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, -8(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 0(rp)	C 11
	bdnz	L(lo_3)		C

	ALIGN(16)
L(end_3):
	addze	r12, r12	C final carry limb
	addic.	vn, vn, -1	C next v limb?  (also clears CA)
	std	r12, 8(rp)
	bne	L(outer_lo_3)
	b	L(ret)
   259  
   260  
	ALIGN(16)
C First pass for un = 1 (mod 4): peel one limb of the mul_1.
L(b1):
	mulld	r0, r26, v0
	mulhdu	r12, r26, v0	C r12 = carry limb into the loop
	addic	r0, r0, 0	C clear CA for the adde chain in L(lo_m_1)
	std	r0, 0(rp)
	bdz	L(end_m_1)

	ALIGN(16)
C 4-limb mul_1 chunks; same scheme as L(lo_m_3).
L(lo_m_1):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)
	ld	r21, 32(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r20, v0
	mulhdu	r27, r20, v0
	mulld	r11, r21, v0
	mulhdu	r26, r21, v0
	adde	r0, r0, r12	C consume carry limb + CA
	adde	r24, r24, r31
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r27
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	mr	r12, r26	C high limb to next iteration (CA still set)
	bdnz	L(lo_m_1)

	ALIGN(16)
L(end_m_1):
	addze	r12, r12	C fold final CA into the top limb
	addic.	vn, vn, -1	C one v limb consumed
	std	r12, 8(rp)
	beq	L(ret)
   302  
	ALIGN(16)
C Outer loop for un = 1 (mod 4): addmul_1 pass with the next v limb.
L(outer_lo_1):
	mtctr	un		C copy inner loop count into ctr
	addi	rp, outer_rp, 8
	mr	up, outer_up
	addi	outer_rp, outer_rp, 8	C result advances 1 limb per v limb
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, 0(up)
	ld	r28, 0(rp)	C existing partial-product limb
	mulld	r0, r26, v0
	mulhdu	r12, r26, v0	C carry limb into the loop
	addc	r0, r0, r28
	std	r0, 0(rp)
	bdz	L(end_1)

	ALIGN(32)		C registers dying
L(lo_1):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)	C
	ld	r21, 32(up)	C
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	mulld	r9, r20, v0	C
	mulhdu	r27, r20, v0	C 26
	mulld	r11, r21, v0	C
	mulhdu	r26, r21, v0	C 27
	ld	r28, -24(rp)	C
	adde	r0, r0, r12	C 0 12
	ld	r29, -16(rp)	C
	adde	r24, r24, r10	C 24 10
	ld	r30, -8(rp)	C
	ld	r31, 0(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r27	C 27 11
	addze	r12, r26	C 26
	addc	r0, r0, r28	C 0 28
	std	r0, -24(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, -16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, -8(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 0(rp)	C 11
	bdnz	L(lo_1)		C

	ALIGN(16)
L(end_1):
	addze	r12, r12	C final carry limb
	addic.	vn, vn, -1	C next v limb?
	std	r12, 8(rp)
	bne	L(outer_lo_1)
	b	L(ret)
   361  
   362  
	ALIGN(16)
C First pass for un = 0 (mod 4): no peeled limbs; bias the pointers back
C one limb so the loop's 8(..)..32(..) offsets line up, and start with a
C zero carry limb.
L(b0):
	addi	up, up, -8
	addi	rp, rp, -8
	li	r12, 0		C zero incoming carry limb...
	addic	r12, r12, 0	C ...and clear CA for the adde chain
	bdz	L(end_m_0)

	ALIGN(16)
C 4-limb mul_1 chunks; same scheme as L(lo_m_3).
L(lo_m_0):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)
	ld	r21, 32(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r20, v0
	mulhdu	r27, r20, v0
	mulld	r11, r21, v0
	mulhdu	r26, r21, v0
	adde	r0, r0, r12	C consume carry limb + CA
	adde	r24, r24, r31
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r27
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	mr	r12, r26	C high limb to next iteration (CA still set)
	bdnz	L(lo_m_0)

	ALIGN(16)
L(end_m_0):
	addze	r12, r12	C fold final CA into the top limb
	addic.	vn, vn, -1	C one v limb consumed
	std	r12, 8(rp)
	beq	L(ret)
   404  
	ALIGN(16)
C Outer loop for un = 0 (mod 4): addmul_1 pass with the next v limb.
L(outer_lo_0):
	mtctr	un		C copy inner loop count into ctr
	addi	rp, outer_rp, 0
	addi	up, outer_up, -8	C biased back one limb, as in L(b0)
	addi	outer_rp, outer_rp, 8	C result advances 1 limb per v limb
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	li	r12, 0		C zero incoming carry limb...
	addic	r12, r12, 0	C ...and clear CA
	bdz	L(end_0)

	ALIGN(32)		C registers dying
L(lo_0):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)	C
	ld	r21, 32(up)	C
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	mulld	r9, r20, v0	C
	mulhdu	r27, r20, v0	C 26
	mulld	r11, r21, v0	C
	mulhdu	r26, r21, v0	C 27
	ld	r28, -24(rp)	C
	adde	r0, r0, r12	C 0 12
	ld	r29, -16(rp)	C
	adde	r24, r24, r10	C 24 10
	ld	r30, -8(rp)	C
	ld	r31, 0(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r27	C 27 11
	addze	r12, r26	C 26
	addc	r0, r0, r28	C 0 28
	std	r0, -24(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, -16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, -8(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 0(rp)	C 11
	bdnz	L(lo_0)		C

	ALIGN(16)
L(end_0):
	addze	r12, r12	C final carry limb
	addic.	vn, vn, -1	C next v limb?
	std	r12, 8(rp)
	bne	L(outer_lo_0)
	b	L(ret)
   459  
   460  
	ALIGN(16)
C First pass for un = 2 (mod 4): peel two limbs of the mul_1.
L(b2):	ld	r27, 8(up)
	addi	up, up, 8
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	addc	r24, r24, r10
	addze	r12, r8		C carry limb into the loop (CA cleared)
	std	r0, 0(rp)
	std	r24, 8(rp)
	addi	rp, rp, 8
	bdz	L(end_m_2)

	ALIGN(16)
C 4-limb mul_1 chunks; same scheme as L(lo_m_3).
L(lo_m_2):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)
	ld	r21, 32(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r20, v0
	mulhdu	r27, r20, v0
	mulld	r11, r21, v0
	mulhdu	r26, r21, v0
	adde	r0, r0, r12	C consume carry limb + CA
	adde	r24, r24, r31
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r27
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	mr	r12, r26	C high limb to next iteration (CA still set)
	bdnz	L(lo_m_2)

	ALIGN(16)
L(end_m_2):
	addze	r12, r12	C fold final CA into the top limb
	addic.	vn, vn, -1	C one v limb consumed
	std	r12, 8(rp)
	beq	L(ret)
   508  
	ALIGN(16)
C Outer loop for un = 2 (mod 4): addmul_1 pass with the next v limb.
L(outer_lo_2):
	mtctr	un		C copy inner loop count into ctr
	addi	rp, outer_rp, 16
	addi	up, outer_up, 8
	addi	outer_rp, outer_rp, 8	C result advances 1 limb per v limb
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, -8(up)
	ld	r27, 0(up)
	ld	r28, -8(rp)	C existing partial-product limbs
	ld	r29, 0(rp)
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	addc	r24, r24, r10
	addze	r12, r8		C carry limb into the loop
	addc	r0, r0, r28	C add products into {rp,...}
	std	r0, -8(rp)
	adde	r24, r24, r29
	std	r24, 0(rp)
	bdz	L(end_2)

	ALIGN(16)		C registers dying
L(lo_2):
	ld	r26, 8(up)
	ld	r27, 16(up)
	ld	r20, 24(up)	C
	ld	r21, 32(up)	C
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	mulld	r9, r20, v0	C
	mulhdu	r27, r20, v0	C 26
	mulld	r11, r21, v0	C
	mulhdu	r26, r21, v0	C 27
	ld	r28, -24(rp)	C
	adde	r0, r0, r12	C 0 12
	ld	r29, -16(rp)	C
	adde	r24, r24, r10	C 24 10
	ld	r30, -8(rp)	C
	ld	r31, 0(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r27	C 27 11
	addze	r12, r26	C 26
	addc	r0, r0, r28	C 0 28
	std	r0, -24(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, -16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, -8(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 0(rp)	C 11
	bdnz	L(lo_2)		C

	ALIGN(16)
L(end_2):
	addze	r12, r12	C final carry limb
	addic.	vn, vn, -1	C next v limb?
	std	r12, 8(rp)
	bne	L(outer_lo_2)
C	b	L(ret)		C fall through to L(ret)
   575  
C Restore callee-saved r20..r31 from below the stack pointer and return.
L(ret):	ld	r31, -8(r1)
	ld	r30, -16(r1)
	ld	r29, -24(r1)
	ld	r28, -32(r1)
	ld	r27, -40(r1)
	ld	r26, -48(r1)
	ld	r25, -56(r1)
	ld	r24, -64(r1)
	ld	r23, -72(r1)
	ld	r22, -80(r1)
	ld	r21, -88(r1)
	ld	r20, -96(r1)
	blr
EPILOGUE()