github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc32/mode1o.asm

github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc32/mode1o.asm (about)

     1  dnl  PowerPC-32 mpn_modexact_1_odd -- mpn by limb exact remainder.
     2  
     3  dnl  Copyright 2002, 2003, 2005, 2006 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C                cycles/limb
    35  C 603e:             ?
    36  C 604e:             6.0
    37  C 75x (G3):         6.0-13.0, depending on divisor
    38  C 7400,7410 (G4):   6.0-13.0, depending on divisor
    39  C 744x,745x (G4+):  8.0-10.0, depending on divisor
    40  C power4/ppc970:   12.0
    41  C power5:          12.0
    42  
    43  
    44  C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
    45  C                               mp_limb_t divisor);
    46  C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
    47  C                                mp_limb_t divisor, mp_limb_t carry);
    48  C
    49  C For PIC, the inverse is established arithmetically since it measures about
    50  C 5 cycles faster than the nonsense needed to access binvert_limb_table in
    51  C SVR4 or Darwin style PIC.  AIX might be better, since it avoids bl/mflr to
    52  C get at the GOT/TOC/whatever.
    53  C
    54  C Using divwu for size==1 measured about 10 cycles slower on 604e, or about
    55  C 3-5 cycles faster on 750.  For now it doesn't seem worth bothering with.
    56  C
    57  C The loop allows an early-out on mullw for the inverse, and on mulhwu for
    58  C the divisor.  So the fastest is for instance divisor==1 (inverse==-1), and
    59  C the slowest is anything giving a full 32-bits in both, such as
    60  C divisor==0xDEADBEEF (inverse==0x904B300F).  These establish the stated
    61  C range above for 750 and 7400.
    62  
    63  
    64  ASM_START()
    65  
    66  EXTERN(binvert_limb_table)
    67  
        C Both entry points share one body.  mpn_modexact_1_odd sets the carry to
        C zero and falls through into mpn_modexact_1c_odd.
        C
        C Register use, as read from the code below:
        C   r3  src pointer on entry, advanced by lwzu; return value on exit
        C   r4  size on entry, moved to CTR; then reused to hold the inverse
        C   r5  divisor
        C   r6  carry (carry-in for the 1c entry, then the running carry)
        C   r7  scratch for the inverse steps, then l and q in the limb loop
        C   r8  scratch, PIC_SLOW path only
        C   r0  current source limb
        C
        C NOTE(review): size == 0 is not tested for; presumably callers always
        C pass size >= 1 and an odd divisor -- confirm against the callers.
    68  PROLOGUE(mpn_modexact_1_odd)
    69  	li	r6, 0			C no carry-in: carry = 0, then fall through
    70  
    71  PROLOGUE(mpn_modexact_1c_odd)
    72  
    73  	mtctr	r4			C CTR = size, counted down by bdz and bdnz
    74  
        C Build the inverse of the divisor mod 2^32 in r4.  The first 8 bits come
        C from arithmetic (PIC_SLOW) or from binvert_limb_table (otherwise).
    75  ifdef(`PIC_SLOW',`
    76  C Load from our table with PIC is so slow on Linux and Darwin that we avoid it
    77  	rlwinm	r7, r5, 1,28,28		C (divisor << 1) & 8
    78  	rlwinm	r8, r5, 2,28,28		C (divisor << 2) & 8
    79  	xor	r7, r7, r8		C ((divisor << 1) ^ (divisor << 2)) & 8
    80  	rlwinm	r4, r5, 0,28,31		C divisor low 4 bits, speedup mullw
    81  	xor	r4, r4, r7		C inverse, 4 bits
    82  	mullw	r7, r4, r4		C i*i
    83  	slwi	r4, r4, 1		C 2*i
    84  	rlwinm	r8, r5, 0,24,31		C divisor low 8 bits, speedup mullw
    85  	mullw	r7, r7, r8		C i*i*d
    86  	sub	r4, r4, r7		C inverse, 8 bits
    87  ',`
    88  	LEA(	r7, binvert_limb_table)
    89  	rlwinm	r4, r5, 31,25,31	C (divisor/2) & 0x7F
    90  	lbzx	r4, r4,r7		C inverse, 8 bits
    91  ')
    92  
        C Two more steps of i = 2*i - i*i*d, each doubling the number of valid
        C low bits of the inverse: 8 -> 16 -> 32.  The load of src[0] is placed
        C among them.
    93  	mullw	r7, r4, r4		C i*i
    94  	slwi	r4, r4, 1		C 2*i
    95  	mullw	r7, r5, r7		C i*i*d   [i*i is 16 bits, so second operand]
    96  	sub	r4, r4, r7		C inverse, 16 bits
    97  	mullw	r7, r4, r4		C i*i
    98  	slwi	r4, r4, 1		C 2*i
    99  	mullw	r7, r7, r5		C i*i*d
   100  	lwz	r0, 0(r3)		C src[0]
   101  	sub	r4, r4, r7		C inverse, 32 bits
   102  	subfc	r7, r6, r0		C l = src[0] - carry, sets CA for the subfe below
   103  
   104  	mullw	r7, r7, r4		C q = l * inverse
   105  	bdz	L(one)			C size == 1: finish using src[0] only
   106  
   107  	lwzu	r0, 4(r3)		C src[1]
   108  	mulhwu	r6, r7, r5		C carry = high(q*divisor)
   109  	subfe	r7, r6, r0		C l = src[1] - carry - borrow, updates CA
   110  	bdz	L(two)			C size == 2: skip the loop
   111  
        C Main loop, one limb per pass.  CA left by one subfe is consumed by the
        C next subfe; the loads and multiplies in between do not write CA.
   112  L(top):
   113  	mullw	r7, r7, r4		C q = l * inverse
   114  	lwzu	r0, 4(r3)		C src[i]
   115  	mulhwu	r6, r7, r5		C carry = high(q*divisor)
   116  	subfe	r7, r6, r0		C l = src[i] - carry - borrow, updates CA
   117  	bdnz	L(top)
   118  
        C Tail: fold the last borrow into the last carry and return the sum in r3.
   119  L(two):	mullw	r7, r7, r4		C q = l * inverse
   120  L(one):	subfe	r3, r3, r3		C r3 = CA - 1: 0 if no borrow, -1 if borrow
   121  	mulhwu	r6, r7, r5		C carry = high(q*divisor)
   122  	subf	r3, r3, r6		C return r6 - r3 = carry + borrow
   123  	blr
   124  
   125  EPILOGUE(mpn_modexact_1c_odd)
   126  EPILOGUE(mpn_modexact_1_odd)
   127  ASM_END()