github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/mode1o.asm (about)

     1  dnl  Alpha mpn_modexact_1c_odd -- mpn exact remainder
     2  
     3  dnl  Copyright 2003, 2004 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C      cycles/limb
    35  C EV4:    47
    36  C EV5:    30
    37  C EV6:    15
    38  
    39  
    40  C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
    41  C                                mp_limb_t c)
    42  C
    43  C This code follows the "alternate" code in mpn/generic/mode1o.c,
    44  C eliminating cbit+climb from the dependent chain.  This leaves,
    45  C
    46  C        ev4   ev5   ev6
    47  C         1     3     1    subq   y = x - h
    48  C        23    13     7    mulq   q = y * inverse
    49  C        23    14     7    umulh  h = high (q * d)
    50  C        --    --    --
    51  C        47    30    15
    52  C
    53  C In each case, the load latency, loop control, and extra carry bit handling
    54  C hide under the multiply latencies.  Those latencies are long enough that
    55  C we don't need to worry about alignment or pairing to squeeze out
    56  C performance.
    57  C
    58  C For the first limb, some of the loop code is broken out and scheduled back
    59  C since it can be done earlier.
    60  C
    61  C   - The first ldq src[0] is near the start of the routine, for maximum
    62  C     time from memory.
    63  C
    64  C   - The subq y=x-climb can be done without waiting for the inverse.
    65  C
    66  C   - The mulq y*inverse is replicated after the final subq for the inverse,
    67  C     instead of branching to the mulq in the main loop.  On ev4 a branch
    68  C     there would cost cycles, but we can hide them under the mulq latency.
    69  C
    70  C For the last limb, high<divisor is tested and if that's true a subtract
    71  C and addback is done, as per the main mpn/generic/mode1o.c code.  This is a
    72  C data-dependent branch, but we're waiting for umulh so any penalty should
    73  C hide there.  The multiplies saved would be worth the cost anyway.
    74  C
    75  C Enhancements:
    76  C
    77  C For size==1, a plain division (done bitwise say) might be faster than
    78  C calculating an inverse, the latter taking about 130 cycles on ev4 or 70 on
    79  C ev5.  A call to gcc __remqu might be a possibility.
    80  
    81  ASM_START()
    82  PROLOGUE(mpn_modexact_1c_odd,gp)
    83  
    84  	C r16	src
    85  	C r17	size
    86  	C r18	d
    87  	C r19	c
    88  
	C Build inv = 1/d mod 2^64 (d odd): an 8-bit inverse comes from
	C binvert_limb_table, then each Newton step i' = 2*i - i*i*d
	C doubles the number of correct low bits (8 -> 16 -> 32 -> 64).
    89  	LEA(r0, binvert_limb_table)
    90  	srl	r18, 1, r20		C d >> 1
    91  
    92  	and	r20, 127, r20		C idx = d>>1 & 0x7F
    93  
    94  	addq	r0, r20, r21		C table + idx
    95  
    96  ifelse(bwx_available_p,1,
    97  `	ldbu	r20, 0(r21)		C table[idx], inverse 8 bits
    98  ',`
    99  	ldq_u	r20, 0(r21)		C table[idx] qword
   100  	extbl	r20, r21, r20		C table[idx], inverse 8 bits
   101  ')
   102  
   103  	mull	r20, r20, r7		C i*i
   104  	addq	r20, r20, r20		C 2*i
   105  
   106  	ldq	r2, 0(r16)		C x = s = src[0]
   107  	lda	r17, -1(r17)		C size--
   108  	clr	r0			C initial cbit=0
   109  
   110  	mull	r7, r18, r7		C i*i*d
   111  
   112  	subq	r20, r7, r20		C 2*i-i*i*d, inverse 16 bits
   113  
   114  	mull	r20, r20, r7		C i*i
   115  	addq	r20, r20, r20		C 2*i
   116  
   117  	mull	r7, r18, r7		C i*i*d
   118  
   119  	subq	r20, r7, r20		C 2*i-i*i*d, inverse 32 bits
   120  
   121  	mulq	r20, r20, r7		C i*i
   122  	addq	r20, r20, r20		C 2*i
   123  
   124  	mulq	r7, r18, r7		C i*i*d
   125  	subq	r2, r19, r3		C y = x - climb
   126  
   127  	subq	r20, r7, r20		C inv = 2*i-i*i*d, inverse 64 bits
   128  
   129  ASSERT(r7, C should have d*inv==1 mod 2^64
   130  `	mulq	r18, r20, r7
   131  	cmpeq	r7, 1, r7')
   132  
	C Since d*inv == 1 mod 2^64 (checked by the ASSERT above),
	C q = y*inv makes the low 64 bits of q*d equal y exactly, so
	C umulh of q*d is the exact carry ("climb") out of this limb.
   133  	mulq	r3, r20, r4		C first q = y * inv
   134  
   135  	beq	r17, L(one)		C if size==1
   136  	br	L(entry)		C enter loop with first mulq already issued
   137  
   138  
   139  L(top):
   140  	C r0	cbit
   141  	C r16	src, incrementing
   142  	C r17	size, decrementing
   143  	C r18	d
   144  	C r19	climb
   145  	C r20	inv
   146  
   147  	ldq	r1, 0(r16)		C s = src[i]
   148  	subq	r1, r0, r2		C x = s - cbit
   149  	cmpult	r1, r0, r0		C new cbit = s < cbit
   150  
   151  	subq	r2, r19, r3		C y = x - climb
   152  
   153  	mulq	r3, r20, r4		C q = y * inv
   154  L(entry):
   155  	cmpult	r2, r19, r5		C cbit2 = x < climb
   156  	addq	r5, r0, r0		C cbit += cbit2
   157  	lda	r16, 8(r16)		C src++
   158  	lda	r17, -1(r17)		C size--
   159  
   160  	umulh	r4, r18, r19		C climb = q * d
   161  	bne	r17, L(top)		C while 2 or more limbs left
   162  
   163  
   164  
   165  	C r0	cbit
   166  	C r18	d
   167  	C r19	climb
   168  	C r20	inv
   169  
   170  	ldq	r1, 0(r16)		C s = src[size-1] high limb
   171  
   172  	cmpult	r1, r18, r2		C test high<divisor
   173  	bne	r2, L(skip)		C skip if so
   174  
   175  	C can't skip a division, repeat loop code
   176  
   177  	subq	r1, r0, r2		C x = s - cbit
   178  	cmpult	r1, r0, r0		C new cbit = s < cbit
   179  
   180  	subq	r2, r19, r3		C y = x - climb
   181  
   182  	mulq	r3, r20, r4		C q = y * inv
	C Entering at L(one) (size==1 branch above): r2 = src[0],
	C r19 = initial c, r0 = 0, and r4 already holds the first q.
   183  L(one):
   184  	cmpult	r2, r19, r5		C cbit2 = x < climb
   185  	addq	r5, r0, r0		C cbit += cbit2
   186  
   187  	umulh	r4, r18, r19		C climb = q * d
   188  
   189  	addq	r19, r0, r0		C return climb + cbit
   190  	ret	r31, (r26), 1
   191  
   192  
   193  	ALIGN(8)
   194  L(skip):
   195  	C with high<divisor, the final step can be just (cbit+climb)-s and
   196  	C an addback of d if that underflows
   197  
   198  	addq	r19, r0, r19		C c = climb + cbit
   199  
   200  	subq	r19, r1, r2		C c - s
   201  	cmpult	r19, r1, r3		C c < s
   202  
   203  	addq	r2, r18, r0		C return c-s + divisor
   204  
   205  	cmoveq	r3, r2, r0		C return c-s if no underflow
   206  	ret	r31, (r26), 1
   207  
   208  EPILOGUE()
   209  ASM_END()