github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc64/mode64/mul_1.asm (about)

     1  dnl  PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
     2  dnl  the result in a second limb vector.
     3  
     4  dnl  Copyright 1999-2001, 2003-2006, 2010 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  C               cycles/limb
    35  C POWER3/PPC630     6-18
    36  C POWER4/PPC970     7.25?  not updated for last file revision
    37  C POWER5            7.25
    38  C POWER6           14
    39  C POWER7            2.9
    40  
    41  C TODO
    42  C  * Try to reduce the number of needed live registers (at least r5 and r10
    43  C    could be combined)
    44  C  * Optimize feed-in code, for speed and size.
    45  C  * Clean up r12/r7 usage in feed-in code.
    46  
    47  C INPUT PARAMETERS
    48  define(`rp', `r3')	C destination pointer: product limbs are stored through it
    49  define(`up', `r4')	C source pointer: multiplicand limbs are loaded through it
    50  define(`n', `r5')	C limb count; r5 is reused as scratch once ctr has been set
    51  define(`vl', `r6')	C the single multiplier limb; the code below names it as r6
    52  
    53  ASM_START()
        C mpn_mul_1c: same operation as mpn_mul_1, but the running carry limb
        C starts from the value passed in r7 instead of zero.  Only the entry
        C sequence lives here; the work is done by the shared body at L(ent).
    54  PROLOGUE(mpn_mul_1c)
    55  	std	r27, -40(r1)		C spill r27 below r1; the shared body reloads it before blr
    56  	std	r26, -48(r1)		C spill r26 below r1; the shared body reloads it before blr
    57  	mr	r12, r7		C r12 = carry-in limb (mpn_mul_1 sets r12 to 0 here instead)
    58  	b	L(ent)		C join the shared body inside mpn_mul_1
    59  EPILOGUE()
    60  PROLOGUE(mpn_mul_1)
        C mpn_mul_1: multiply the n limbs at up by the limb in r6 (vl), store the
        C n low result limbs at rp, and leave the final carry limb in r3.
        C mpn_mul_1c branches in at L(ent) with r12 preset from r7.
        C
        C Register roles in this body:
        C   r12            running carry limb (high half owed to the next limb)
        C   r26, r27       source limbs loaded from up
        C   r0 r7 r9 r11   low product halves
        C   r5 r8 r10 r12  high product halves
        C   ctr            (n + 3) / 4, consumed by bdnz and bdz
        C
        C Dispatch on n mod 4: 0 -> L(b00), 1 -> L(b01), 2 -> L(b10), 3 -> L(b11).
        C The feed-in blocks consume 2, 1 or 3, 0 and 1 limbs respectively, so the
        C limbs still pending on reaching L(b10) are always 2 mod 4: L(top)
        C retires four per iteration and L(end) retires the last two.
        C
        C NOTE(review): addic also writes the carry bit, and the L(b10) path reaches
        C its first adde with no addc in between, so it appears to rely on that
        C carry being clear (n + 3 not wrapping) -- confirm against the ISA.
        C NOTE(review): r26/r27 are saved below r1 without moving the stack
        C pointer; presumably the ABI guarantees that area is usable -- confirm.
    61  	std	r27, -40(r1)		C spill r27 below r1; reloaded after L(ret)
    62  	std	r26, -48(r1)		C spill r26 below r1; reloaded after L(ret)
    63  	li	r12, 0		C cy_limb = 0
    64  L(ent):	ld	r26, 0(up)		C shared entry: r26 = first source limb
    65  
    66  	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
    67  	cmpdi	cr6, r0, 2		C cr6 = (n & 3) compared with 2
    68  	addic	n, n, 3		C compute count...
    69  	srdi	n, n, 2		C ...for ctr
    70  	mtctr	n		C copy count into ctr
    71  	beq	cr0, L(b00)		C n mod 4 == 0
    72  	blt	cr6, L(b01)		C n mod 4 == 1 (nonzero and below 2)
    73  	beq	cr6, L(b10)		C n mod 4 == 2; otherwise fall into L(b11)
    74  
    75  L(b11):	mr	r7, r12		C n mod 4 == 3: keep carry-in in r7, r12 is overwritten next
    76  	mulld	r0, r26, r6		C r0 = low half of first limb times vl
    77  	mulhdu	r12, r26, r6		C r12 = high half, the new running carry limb
    78  	addi	up, up, 8
    79  	addc	r0, r0, r7		C add carry-in; starts the carry-bit chain
    80  	std	r0, 0(rp)
    81  	addi	rp, rp, 8
    82  	b	L(fic)		C one limb retired; reload r26/r27 and join the main path
    83  
    84  L(b00):	ld	r27, 8(up)		C n mod 4 == 0: retire two limbs here
    85  	addi	up, up, 16
    86  	mulld	r0, r26, r6
    87  	mulhdu	r5, r26, r6		C r5 (the old n) is free now that ctr is loaded
    88  	mulld	r7, r27, r6
    89  	mulhdu	r8, r27, r6
    90  	addc	r0, r0, r12		C first low half plus carry-in
    91  	adde	r7, r7, r5		C second low half plus first high half plus carry bit
    92  	addze	r12, r8		C r12 = second high half plus carry bit
    93  	std	r0, 0(rp)
    94  	std	r7, 8(rp)
    95  	addi	rp, rp, 16
    96  	b	L(fic)
    97  
    98  	nop			C alignment
    99  L(b01):	bdnz	L(gt1)		C n mod 4 == 1: decrement ctr; nonzero means n > 1
   100  	mulld	r0, r26, r6		C n == 1: the single product
   101  	mulhdu	r8, r26, r6		C high half goes in r8, which L(ret) folds into r3
   102  	addc	r0, r0, r12		C add carry-in; carry bit is consumed at L(ret)
   103  	std	r0, 0(rp)
   104  	b	L(ret)
   105  L(gt1):	ld	r27, 8(up)		C n >= 5 here: retire three limbs
   106  	nop
   107  	mulld	r0, r26, r6
   108  	mulhdu	r5, r26, r6
   109  	ld	r26, 16(up)		C third limb reuses r26 once the first is multiplied
   110  	mulld	r7, r27, r6
   111  	mulhdu	r8, r27, r6
   112  	mulld	r9, r26, r6
   113  	mulhdu	r10, r26, r6
   114  	addc	r0, r0, r12		C first low half plus carry-in
   115  	adde	r7, r7, r5
   116  	adde	r9, r9, r8
   117  	addze	r12, r10		C r12 = third high half plus carry bit
   118  	std	r0, 0(rp)
   119  	std	r7, 8(rp)
   120  	std	r9, 16(rp)
   121  	addi	up, up, 24
   122  	addi	rp, rp, 24
   123  	b	L(fic)
   124  
   125  	nop
   126  L(fic):	ld	r26, 0(up)		C common feed-in exit: load the next limb pair
   127  L(b10):	ld	r27, 8(up)		C n mod 4 == 2 enters here; r26 was loaded at L(ent)
   128  	addi	up, up, 16
   129  	bdz	L(end)		C decrement ctr; zero means only the final pair remains
   130  
   131  L(top):	mulld	r0, r26, r6		C main loop: four limbs per iteration
   132  	mulhdu	r5, r26, r6
   133  	mulld	r7, r27, r6
   134  	mulhdu	r8, r27, r6
   135  	ld	r26, 0(up)		C second limb pair of this iteration
   136  	ld	r27, 8(up)
   137  	adde	r0, r0, r12		C low half plus running carry limb plus carry bit
   138  	adde	r7, r7, r5
   139  	mulld	r9, r26, r6
   140  	mulhdu	r10, r26, r6
   141  	mulld	r11, r27, r6
   142  	mulhdu	r12, r27, r6		C fourth high half: carry limb for the next adde on r12
   143  	ld	r26, 16(up)		C preload the pair used by the next iteration or by L(end)
   144  	ld	r27, 24(up)
   145  	std	r0, 0(rp)
   146  	adde	r9, r9, r8
   147  	std	r7, 8(rp)
   148  	adde	r11, r11, r10		C carry bit stays pending for the next adde
   149  	std	r9, 16(rp)
   150  	addi	up, up, 32
   151  	std	r11, 24(rp)
   152  
   153  	addi	rp, rp, 32
   154  	bdnz	L(top)		C loop while ctr is nonzero after the decrement
   155  
   156  L(end):	mulld	r0, r26, r6		C wind-down: the last two limbs are already in r26/r27
   157  	mulhdu	r5, r26, r6
   158  	mulld	r7, r27, r6
   159  	mulhdu	r8, r27, r6		C last high half, folded into the return value below
   160  	adde	r0, r0, r12
   161  	adde	r7, r7, r5
   162  	std	r0, 0(rp)
   163  	std	r7, 8(rp)
   164  L(ret):	addze	r3, r8		C return value = last high half plus pending carry bit
   165  	ld	r27, -40(r1)		C restore r27 from its spill slot
   166  	ld	r26, -48(r1)		C restore r26 from its spill slot
   167  	blr
   168  EPILOGUE()