github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc32/mod_34lsub1.asm (about)

     1  dnl  PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.
     2  
     3  dnl  Copyright 2002, 2003, 2005 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C                cycles/limb
    35  C 603e:            ?
    36  C 604e:            3
    37  C 75x (G3):        3
    38  C 7400,7410 (G4):  3
    39  C 744x,745x (G4+): 3
    40  C power4/ppc970:   2.5
    41  C power5:          2.5
    42  
    43  C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
    44  C
    45  C There seems to be no need to schedule the loads back; the code is still
    46  C 3.0 c/l on 750/7400 no matter where they're placed.
    47  C
    48  C Alternatives:
    49  C
    50  C Fetching halfwords would allow plain add to be used for accumulating,
    51  C instead of adde and its serialization.  An outer loop would be required
    52  C though, since a sum of 2^16 halfwords can overflow a 32-bit accumulator.
    53  C lhz+add would be 2.0 c/l, but if there's also a bdz or bdnz for each and
    54  C a pointer update say every three limbs then the total would be 2.67 c/l,
    55  C which isn't much faster than the current simpler code.
    56  
    57  ASM_START()
    58  PROLOGUE(mpn_mod_34lsub1)
    59  
    60  	C In:   r3 = src, r4 = size
    61  	C Out:  r3 = value congruent to {src,size} mod 2^24-1 (not fully reduced)
    62  	C
    63  	C Limbs are summed into three accumulators according to index mod 3.
    64  	C Since 2^96 == 1 mod 2^24-1, a carry out of the third accumulator
    65  	C wraps around into the first through the adde chain.
    66  
    67  	addic	r6, r3, 8		C r6 = &src[2]; CA is left clear for the adde chain
    68  	mtctr	r4			C CTR = size, stepped down once per limb
    69  
    70  	lwz	r3, 0(r3)		C acc0 = src[0]
    71  	bdz	L(ret)			C one limb: return it unchanged
    72  
    73  	lwz	r4, -4(r6)		C acc1 = src[1]
    74  	bdz	L(fold2)		C two limbs: short fold, no acc2 or carry
    75  
    76  	lwz	r5, 0(r6)		C acc2 = src[2]
    77  	li	r7, 0			C exactly three limbs: no carry to add
    78  	bdz	L(fold3)
    79  
    80  	lis	r7, 1			C carry weight 0x10000, adjusted on loop exit
    81  
    82  L(loop):
    83  	C r0	limb just loaded
    84  	C r3,r4,r5	acc0, acc1, acc2
    85  	C r6	src pointer, advanced by 3 limbs per pass
    86  	C r7	0x10000
    87  	C CA	carry from the previous adde
    88  
    89  	lwz	r0, 4(r6)
    90  	adde	r3, r3, r0		C acc0 += limb + CA
    91  	bdz	L(stop0)
    92  
    93  	lwz	r0, 8(r6)
    94  	adde	r4, r4, r0		C acc1 += limb + CA
    95  	bdz	L(stop1)
    96  
    97  	lwzu	r0, 12(r6)		C load and step r6 forward by 12 bytes
    98  	adde	r5, r5, r0		C acc2 += limb + CA
    99  	bdnz	L(loop)
   100  
   101  	C Scale r7 to the weight of the pending carry mod 2^24-1:
   102  	C   out of acc2: 2^96 == 1        (two shifts)
   103  	C   out of acc0: 2^32 == 0x100    (one shift)
   104  	C   out of acc1: 2^64 == 0x10000  (no shift)
   105  	srwi	r7, r7, 8
   106  L(stop0):
   107  	srwi	r7, r7, 8
   108  L(stop1):
   109  	subfe	r0, r0, r0		C r0 = 0 if CA set, -1 if CA clear
   110  	andc	r7, r7, r0		C r7 = carry weight if CA set, else 0
   111  
   112  L(fold3):
   113  	C Add the 24-bit aligned pieces of each accumulator to r7.
   114  	C acc0 has weight 1, acc1 2^32 == 2^8, acc2 2^64 == 2^16.
   115  
   116  	srwi	r6, r5, 8		C acc2 upper 24 bits
   117  	add	r7, r7, r6
   118  	rlwinm	r6, r5, 16,8,15		C acc2 low 8 bits, moved up 16
   119  	add	r7, r7, r6
   120  	srwi	r6, r4, 16		C acc1 upper 16 bits
   121  	add	r7, r7, r6
   122  	rlwinm	r6, r4, 8,8,23		C acc1 low 16 bits, moved up 8
   123  	add	r7, r7, r6
   124  	srwi	r6, r3, 24		C acc0 upper 8 bits
   125  	add	r7, r7, r6
   126  	rlwinm	r6, r3, 0,8,31		C acc0 low 24 bits
   127  	add	r3, r7, r6		C result
   128  
   129  L(ret):
   130  	blr
   131  
   132  L(fold2):
   133  	C r3 = acc0, r4 = acc1; same folding as above without acc2.
   134  
   135  	srwi	r5, r4, 16		C acc1 upper 16 bits
   136  	rlwinm	r4, r4, 8,8,23		C acc1 low 16 bits, moved up 8
   137  	add	r4, r4, r5
   138  	srwi	r5, r3, 24		C acc0 upper 8 bits
   139  	rlwinm	r3, r3, 0,8,31		C acc0 low 24 bits
   140  	add	r3, r3, r5
   141  	add	r3, r3, r4		C result
   142  	blr
   143  
   144  EPILOGUE()