github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc32/addmul_1.asm (about)

     1  dnl  PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
     2  dnl  result to a second limb vector.
     3  
     4  dnl  Copyright 1995, 1997, 1998, 2000-2003, 2005 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  C                cycles/limb
    35  C 603e:            ?
    36  C 604e:            6.75
    37  C 75x (G3):        8.7-14.3
    38  C 7400,7410 (G4):  8.7-14.3
    39  C 744x,745x (G4+): 9.5
    40  C power4/ppc970:   6.25
    41  C power5:          6.25
    42  
    43  C INPUT PARAMETERS
    44  C rp	r3
    45  C up	r4
    46  C n	r5
    47  C vl	r6
    48  
    49  C This is optimized for the PPC604.  It has not been tuned for other
    50  C PowerPC processors.
    51  C
    52  C Loop Analysis for the 604:
    53  C 12 mem insn
    54  C 8 serializing insn
    55  C 8 int multiply
    56  C 25 int reg write
    57  C 9 int ops (8 of which serialize)
    58  C
    59  C The multiply insns need 16 cycles/4limb.
    60  C The integer register writes will need 13 cycles/4limb.
    61  C All-in-all, it should be possible to get to 4 or 5 cycles/limb on PPC604,
    62  C but that will require some clever FPNOPS and BNOPS for exact
    63  C issue control.
    64  
    65  
    66  ASM_START()
    67  PROLOGUE(mpn_addmul_1)	C rp[0..n-1] += up[0..n-1] * vl, carry-out limb left in r3
    68  	cmpwi	cr0,r5,9	C more than 9 limbs?
    69  	bgt	cr0,L(big)	C branch if more than 9 limbs
    70  C Short path, n <= 9: first limb peeled, then one limb per iteration.
    71  	mtctr	r5		C ctr = n; NOTE(review): assumes n >= 1, confirm with callers
    72  	lwz	r0,0(r4)	C r0 = up[0]
    73  	mullw	r7,r0,r6	C r7 = low word of up[0]*vl
    74  	mulhwu	r10,r0,r6	C r10 = high word of up[0]*vl
    75  	lwz	r9,0(r3)	C r9 = rp[0]
    76  	addc	r8,r7,r9	C r8 = low word + rp[0], carry out in CA
    77  	addi	r3,r3,-4	C bias rp down one limb so the stwu below lands on rp[0]
    78  	bdz	L(end)		C n = 1: only the final store and carry remain
    79  L(loop):
    80  	lwzu	r0,4(r4)	C r0 = next up limb, up pointer advanced
    81  	stwu	r8,4(r3)	C store previous result limb, rp pointer advanced
    82  	mullw	r8,r0,r6	C low word of this product
    83  	adde	r7,r8,r10	C low word + previous high word + CA from the last rp add
    84  	mulhwu	r10,r0,r6	C high word of this product
    85  	lwz	r9,4(r3)	C r9 = rp limb matching the up limb just loaded
    86  	addze	r10,r10		C fold the carry of the adde into the high word
    87  	addc	r8,r7,r9	C add the rp limb, carry out in CA for the next round
    88  	bdnz	L(loop)
    89  L(end):	stw	r8,4(r3)	C store the last result limb
    90  	addze	r3,r10		C r3 = last high word + CA, the carry-out limb
    91  	blr
    92  C Long path, n > 9: peel one limb, run four limbs per iteration, then a tail loop.
    93  L(big):	stwu	r1,-16(r1)	C open a 16-byte stack frame, old r1 stored at 0(r1)
    94  	addi	r5,r5,-1	C r5 = n-1, limbs left after the peeled one
    95  	stw	r30,8(r1)	C save r30, used as a limb register in loopU
    96  	srwi	r0,r5,2		C r0 = (n-1)/4
    97  	stw	r31,12(r1)	C save r31, used as a limb register in loopU
    98  	mtctr	r0		C ctr = number of unrolled iterations
    99  C Peeled first limb: rp[0] is written here, r0 becomes the running cy_limb.
   100  	lwz	r7,0(r4)
   101  	mullw	r8,r7,r6
   102  	mulhwu	r0,r7,r6	C r0 = high word, carried into the next limb
   103  	lwz	r7,0(r3)
   104  	addc	r8,r8,r7	C carry out in CA, consumed by the first adde of loopU
   105  	stw	r8,0(r3)
   106  C Unrolled loop: r8-r11 collect four result limbs, r0 carries the high word along.
   107  L(loopU):
   108  	lwz	r7,4(r4)	C load four up limbs
   109  	lwz	r12,8(r4)
   110  	lwz	r30,12(r4)
   111  	lwzu	r31,16(r4)	C this load also advances up by four limbs
   112  	mullw	r8,r7,r6	C four low product words
   113  	mullw	r9,r12,r6
   114  	mullw	r10,r30,r6
   115  	mullw	r11,r31,r6
   116  	adde	r8,r8,r0	C add cy_limb, plus the CA left by the last rp add
   117  	mulhwu	r0,r7,r6	C each high word is added into the following limb
   118  	lwz	r7,4(r3)	C up limb consumed: r7,r12,r30,r31 are reloaded with rp limbs
   119  	adde	r9,r9,r0
   120  	mulhwu	r0,r12,r6
   121  	lwz	r12,8(r3)
   122  	adde	r10,r10,r0
   123  	mulhwu	r0,r30,r6
   124  	lwz	r30,12(r3)
   125  	adde	r11,r11,r0
   126  	mulhwu	r0,r31,r6
   127  	lwz	r31,16(r3)
   128  	addze	r0,r0		C new cy_limb
   129  	addc	r8,r8,r7	C second carry chain: add the four rp limbs
   130  	stw	r8,4(r3)
   131  	adde	r9,r9,r12
   132  	stw	r9,8(r3)
   133  	adde	r10,r10,r30
   134  	stw	r10,12(r3)
   135  	adde	r11,r11,r31	C CA left here feeds the next adde with cy_limb
   136  	stwu	r11,16(r3)	C this store also advances rp by four limbs
   137  	bdnz	L(loopU)
   138  C Tail: the remaining (n-1) mod 4 limbs, one per iteration.
   139  	andi.	r31,r5,3	C r31 = (n-1) mod 4, cr0 set for the beq below
   140  	mtctr	r31
   141  	beq	cr0,L(endx)	C no leftover limbs
   142  
   143  L(loopE):
   144  	lwzu	r7,4(r4)	C r7 = next up limb, up pointer advanced
   145  	mullw	r8,r7,r6
   146  	adde	r8,r8,r0	C add cy_limb
   147  	mulhwu	r0,r7,r6
   148  	lwz	r7,4(r3)	C r7 reused for the matching rp limb
   149  	addze	r0,r0		C new cy_limb
   150  	addc	r8,r8,r7
   151  	stwu	r8,4(r3)	C store result limb, rp pointer advanced
   152  	bdnz	L(loopE)
   153  L(endx):
   154  	addze	r3,r0		C r3 = cy_limb + CA, the carry-out limb
   155  	lwz	r30,8(r1)	C restore the registers saved at L(big)
   156  	lwz	r31,12(r1)
   157  	addi	r1,r1,16	C release the 16-byte stack frame
   158  	blr
   159  EPILOGUE(mpn_addmul_1)
   159  EPILOGUE(mpn_addmul_1)