github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc64/mode64/p6/aorsmul_1.asm (about)

     1  dnl  PowerPC-64 mpn_addmul_1 and mpn_submul_1 optimised for power6.
     2  
     3  dnl  Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
     4  dnl  Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  C               mpn_addmul_1    mpn_submul_1
    35  C               cycles/limb     cycles/limb
    36  C POWER3/PPC630     ?               ?
    37  C POWER4/PPC970     ?               ?
    38  C POWER5            ?               ?
    39  C POWER6           12.25           12.8
    40  C POWER7            ?               ?
    41  
    42  C TODO
    43  C  * Reduce register usage.
    44  C  * Schedule function entry code.
    45  C  * Unroll more.  8-way unrolling would bring us to 10 c/l, 16-way unrolling
    46  C    would bring us to 9 c/l.
    47  C  * Handle n = 1 and perhaps n = 2 separately, without saving any registers.
    48  
    49  C INPUT PARAMETERS
    50  define(`rp',  `r3')	C limbs that are loaded, combined with the products and stored back
    51  define(`up',  `r4')	C limbs that are only loaded and multiplied
    52  define(`n',   `r5')	C limb count; only used to derive ctr, after which r5 is reused as scratch
    53  define(`v0',  `r6')	C the single limb every up limb is multiplied by
    54  C Each ifdef below supplies the same macro set for one operation.
    55  ifdef(`OPERATION_addmul_1',`
    56    define(ADDSUBC,	adde)	C combine with an rp limb using and updating CA
    57    define(ADDSUB,	addc)	C same but ignoring the incoming CA; starts a chain
    58    define(func,		mpn_addmul_1)
    59    define(func_nc,	mpn_addmul_1c)	C FIXME: not really supported
    60    define(AM,		`$1')	C AM emits its argument in the addmul build
    61    define(SM,		`')	C SM emits nothing in the addmul build
    62    define(CLRRSC,	`addic	$1, r0, 0')	C register = r0 + 0 with CA from the add; r0 is 0 at the one use in L b0
    63  ')
    64  ifdef(`OPERATION_submul_1',`
    65    define(ADDSUBC,	subfe)	C rp limb minus product limb using and updating CA
    66    define(ADDSUB,	subfc)	C same but ignoring the incoming CA; starts a chain
    67    define(func,		mpn_submul_1)
    68    define(func_nc,	mpn_submul_1c)	C FIXME: not really supported
    69    define(AM,		`')	C AM emits nothing in the submul build
    70    define(SM,		`$1')	C SM emits its argument in the submul build
    71    define(CLRRSC,	`subfc	$1, r0, r0')	C register = r0 - r0 = 0 and CA is set, meaning no borrow
    72  ')
    73  
    74  MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
    75  C func: multiply the n limbs at up by v0, add them to (addmul) or subtract them from (submul) the n limbs at rp in place, and leave the final carry limb in r3.
    76  ASM_START()
    77  PROLOGUE(func)
    78  	std	r31, -8(r1)	C save r27-r31 below r1 without moving r1 (NOTE(review): relies on the ABI keeping that area safe - confirm)
    79  	std	r30, -16(r1)
    80  	std	r29, -24(r1)
    81  	std	r28, -32(r1)
    82  	std	r27, -40(r1)
    83  C Dispatch on n & 3: a leftover of 1-3 limbs is handled by entering the 4-way loop body part way in.
    84  	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
    85  	cmpdi	cr6, r0, 2	C cr6 = (n & 3) compared with 2
    86  	addi	n, n, 3		C compute count...
    87  	srdi	n, n, 2		C ...for ctr
    88  	mtctr	n		C copy loop count (n + 3) >> 2 into ctr; NOTE(review): this is 0 for n = 0, presumably callers pass n >= 1
    89  	beq	cr0, L(b0)	C n & 3 == 0
    90  	blt	cr6, L(b1)	C n & 3 == 1
    91  	beq	cr6, L(b2)	C n & 3 == 2
    92  C n & 3 == 3 falls through: process three limbs, then join the loop tail at L(l3).
    93  L(b3):	ld	r8, 0(up)
    94  	ld	r7, 8(up)
    95  	ld	r27, 16(up)
    96  	addi	up, up, 16	C leave up and rp on the last limb of this group
    97  	addi	rp, rp, 16
    98  	mulld	r5,  r8, v0	C r5 was n; the loop count already lives in ctr
    99  	mulhdu	r8,  r8, v0
   100  	mulld	r9,  r7, v0
   101  	mulhdu	r7,  r7, v0
   102  	mulld	r11, r27, v0
   103  	mulhdu	r27, r27, v0
   104  	ld	r29, -16(rp)
   105  	ld	r30, -8(rp)
   106  	ld	r31, 0(rp)
   107  	addc	r9, r9, r8	C add each high half into the next low half
   108  	adde	r11, r11, r7
   109  	addze	r12, r27	C r12 = last high half + carry
   110  	ADDSUB	r5, r5, r29	C first rp limb starts the rp carry chain
   111  	b	L(l3)
   112  C n & 3 == 2: process two limbs, then join the loop tail at L(l2).
   113  L(b2):	ld	r7, 0(up)
   114  	ld	r27, 8(up)
   115  	addi	up, up, 8	C leave up and rp on the last limb of this group
   116  	addi	rp, rp, 8
   117  	mulld	r9,  r7, v0
   118  	mulhdu	r7,  r7, v0
   119  	mulld	r11, r27, v0
   120  	mulhdu	r27, r27, v0
   121  	ld	r30, -8(rp)
   122  	ld	r31, 0(rp)
   123  	addc	r11, r11, r7	C second low half + first high half
   124  	addze	r12, r27	C r12 = last high half + carry
   125  	ADDSUB	r9, r9, r30	C first rp limb starts the rp carry chain
   126  	b	L(l2)
   127  C n & 3 == 1: process one limb, then join the loop tail at L(l1); pointers need no adjustment.
   128  L(b1):	ld	r27, 0(up)
   129  	ld	r31, 0(rp)
   130  	mulld	r11, r27, v0
   131  	mulhdu	r12, r27, v0	C high half goes straight into the carry limb r12
   132  	ADDSUB	r11, r11, r31
   133  	b	L(l1)
   134  C n & 3 == 0: fall into the loop at the top.
   135  L(b0):	addi	up, up, -8	C bias by one limb: the loop loads start at 8(up)
   136  	addi	rp, rp, -8	C after the loop adds 32, -24(rp) is the first rp limb
   137  	CLRRSC(	r12)		C clear r12 and clr/set cy
   138  C Main loop, four limbs per pass.  On entry r12 is the carry limb and CA is the state left by the last ADDSUB/ADDSUBC or by CLRRSC.
   139  	ALIGN(32)
   140  L(top):
   141  SM(`	subfe	r11, r0, r0')	C submul only: complement...
   142  SM(`	addic	r11, r11, 1')	C ...carry flag, so a pending borrow is added in by the adde chain below
   143  	ld	r10, 8(up)
   144  	ld	r8, 16(up)
   145  	ld	r7, 24(up)
   146  	ld	r27, 32(up)
   147  	addi	up, up, 32
   148  	addi	rp, rp, 32
   149  	mulld	r0,  r10, v0	C low/high product halves end up in r0/r10, r5/r8, r9/r7, r11/r27
   150  	mulhdu	r10, r10, v0
   151  	mulld	r5,  r8, v0
   152  	mulhdu	r8,  r8, v0
   153  	mulld	r9,  r7, v0
   154  	mulhdu	r7,  r7, v0
   155  	mulld	r11, r27, v0
   156  	mulhdu	r27, r27, v0
   157  	ld	r28, -24(rp)
   158  	adde	r0, r0, r12	C first low half + carry limb of previous group + CA
   159  	ld	r29, -16(rp)
   160  	adde	r5, r5, r10	C each further low half + previous high half + CA
   161  	ld	r30, -8(rp)
   162  	ld	r31, 0(rp)
   163  	adde	r9, r9, r8
   164  	adde	r11, r11, r7
   165  	addze	r12, r27	C new carry limb = last high half + CA
   166  	ADDSUB	r0, r0, r28	C combine with the rp limbs; a fresh carry chain starts here
   167  	std	r0, -24(rp)
   168  	ADDSUBC	r5, r5, r29
   169  L(l3):	std	r5, -16(rp)	C entry point from L(b3)
   170  	ADDSUBC	r9, r9, r30
   171  L(l2):	std	r9, -8(rp)	C entry point from L(b2)
   172  	ADDSUBC	r11, r11, r31
   173  L(l1):	std	r11, 0(rp)	C entry point from L(b1)
   174  	bdnz	L(top)		C decrement ctr, loop while it is nonzero
   175  C Fold the final CA state into the carry limb, put it in r3, and restore r27-r31.
   176  AM(`	addze	r3, r12')	C addmul: r3 = r12 + CA
   177  SM(`	subfe	r11, r0, r0')		C submul: r11 = CA - 1, i.e. 0 if CA is set and -1 if it is clear
   178  	ld	r31, -8(r1)
   179  SM(`	subf	r3, r11, r12')	C submul: r3 = r12 - r11, i.e. r12 plus one when CA was clear
   180  	ld	r30, -16(r1)
   181  	ld	r29, -24(r1)
   182  	ld	r28, -32(r1)
   183  	ld	r27, -40(r1)
   184  	blr
   185  EPILOGUE()