github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/divrem_2.asm

github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/divrem_2.asm (about)

     1  dnl  Alpha mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
     2  
     3  dnl  Copyright 2007, 2008, 2013 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C		norm	frac
    34  C ev4
    35  C ev5		70	70
    36  C ev6		29	29
    37  
    38  C TODO
    39  C  * Perhaps inline mpn_invert_limb, that would allow us to not save/restore
    40  C    any registers (thus save ~10 cycles per call).
    41  C  * Use negated d1 and/or d0 to speed carry propagation.  Might save a cycle
    42  C    or two.
    43  C  * Check cluster delays (for ev6).  We very likely could save some cycles.
    44  C  * Use branch-free code for computing di.
    45  C  * r14 (callee-saved) carries the loop counter across the mpn_invert_limb call.
    46  
    47  C INPUT PARAMETERS
    48  define(`qp',		`r16')	C quotient limbs are stored through this pointer
    49  define(`fn',		`r17')	C count of extra low q limbs developed with n0 = 0
    50  define(`up_param',	`r18')	C dividend limbs, remainder is written back here
    51  define(`un_param',	`r19')	C number of dividend limbs
    52  define(`dp',		`r20')	C divisor: d0 at offset 0, d1 at offset 8
    53  
    54  ASM_START()
    55  PROLOGUE(mpn_divrem_2,gp)
    56  	lda	r30, -80(r30)		C allocate 80-byte frame
    57  	stq	r26, 0(r30)		C save return address
    58  	stq	r9, 8(r30)
    59  	stq	r10, 16(r30)
    60  	stq	r11, 24(r30)
    61  	stq	r12, 32(r30)
    62  	stq	r13, 40(r30)
    63  	stq	r14, 48(r30)		C r14 will hold the loop counter
    64  	stq	r15, 56(r30)
    65  	.prologue	1
    66  	stq	r16, 64(r30)		C keep qp, r16 is reused as call argument
    67  	bis	r31, r17, r15		C r15 = fn
    68  	s8addq	r19, r18, r13		C r13 = up + 8*un
    69  	lda	r13, -24(r13)		C r13 = &up[un-3]
    70  	ldq	r12, 8(r20)		C d1 = dp[1]
    71  	ldq	r10, 0(r20)		C d0 = dp[0]
    72  	ldq	r11, 16(r13)		C n2 = up[un-1]
    73  	ldq	r9, 8(r13)		C n1 = up[un-2]
    74  
    75  	bis	r31, r31, r3		C most_significant_q_limb = 0
    76  	cmpult	r11, r12, r1		C n2 < d1: no initial subtraction
    77  	bne	r1, L(L8)
    78  	cmpule	r11, r12, r1		C n2 >= d1 here, so this tests n2 == d1
    79  	cmpult	r9, r10, r2		C borrow from n1 - d0
    80  	and	r1, r2, r1		C n2 == d1 and n1 < d0: {n2,n1} < {d1,d0}
    81  	bne	r1, L(L8)
    82  	subq	r11, r12, r11		C n2 -= d1
    83  	subq	r11, r2, r11		C n2 -= borrow
    84  	subq	r9, r10, r9		C n1 -= d0
    85  	lda	r3, 1(r31)		C most_significant_q_limb = 1
    86  L(L8):	stq	r3, 72(r30)		C kept in the frame, returned in r0
    87  
    88  	addq	r15, r19, r14		C i = fn + un
    89  	lda	r14, -3(r14)		C i = fn + un - 3, index of top q limb
    90  	blt	r14, L(L10)		C no quotient limbs to develop
    91  	bis	r31, r12, r16		C argument: d1
    92  	jsr	r26, mpn_invert_limb
    93  	LDGP(	r29, 0(r26))
    94  	mulq	r0, r12, r4		C t0 = LO(di * d1)
    95  	umulh	r0, r10, r2		C s1 = HI(di * d0)
    96  	addq	r4, r10, r4		C t0 += d0
    97  	cmpule	r10, r4, r7		C 1 if that add did not carry: !(t0 < d0)
    98  	addq	r4, r2, r4		C t0 += s1
    99  	cmpult	r4, r2, r1		C cy for: t0 += s1
   100  	subq	r1, r7, r7		C t1 (-1, 0, or 1)
   101  	blt	r7, L(L42)		C neither add carried: skip di adjustment
   102  L(L22):
   103  	lda	r0, -1(r0)		C di--
   104  	cmpult	r4, r12, r1		C cy for: t0 -= d1 (below)
   105  	subq	r7, r1, r7		C t1 -= cy
   106  	subq	r4, r12, r4		C t0 -= d1
   107  	bge	r7, L(L22)
   108  L(L42):
   109  	ldq	r16, 64(r30)		C reload qp
   110  	s8addq	r14, r16, r16		C r16 = &qp[i]
   111  	ALIGN(16)
   112  L(loop):
   113  	mulq	r11, r0, r5		C q0 (early)
   114  	umulh	r11, r0, r6		C q  (early)
   115  	addq	r5, r9, r8		C q0 += n1
   116  	addq	r6, r11, r6		C q  += n2
   117  	cmpult	r8, r5, r1		C cy for: q0 += n1
   118  	addq	r6, r1, r6		C q  += cy
   119  	unop
   120  	mulq	r12, r6, r1		C LO(d1 * q)
   121  	umulh	r10, r6, r7		C t1 = HI(d0 * q)
   122  	subq	r9, r1, r9		C n1 -= LO(d1 * q)
   123  	mulq	r10, r6, r4		C t0 = LO(d0 * q)
   124  	unop
   125  	cmple	r15, r14, r5		C r5 = (fn <= i); a 0 result doubles as n0 = 0
   126  	beq	r5, L(L31)		C i < fn: keep n0 = 0
   127  	ldq	r5, 0(r13)		C n0 = next dividend limb
   128  	lda	r13, -8(r13)
   129  L(L31):	subq	r9, r12, r9		C n1 -= d1
   130  	cmpult	r5, r10, r1		C cy for: n0 -= d0
   131  	subq	r9, r1, r9		C n1 -= cy
   132  	subq	r5, r10, r5		C n0 -= d0
   133  	subq	r9, r7, r9		C n1 -= t1
   134  	cmpult	r5, r4, r1		C cy for: n0 -= t0
   135  	subq	r9, r1, r2		C n1 -= cy
   136  	subq	r5, r4, r5		C n0 -= t0
   137  	cmpult	r2, r8, r1		C (n1 < q0)
   138  	addq	r6, r1, r6		C q += cond
   139  	lda	r1, -1(r1)		C -(n1 >= q0)
   140  	and	r1, r10, r4		C mask & d0
   141  	addq	r5, r4, r9		C n0 += mask & d0
   142  	and	r1, r12, r1		C mask & d1
   143  	cmpult	r9, r5, r11		C cy for: n0 += mask & d0
   144  	addq	r2, r1, r1		C n1 += mask & d1
   145  	addq	r1, r11, r11		C n1 += cy
   146  	cmpult	r11, r12, r1		C n1 < d1: no further correction needed
   147  	beq	r1, L(fix)		C n1 >= d1: maybe subtract d once more
   148  L(bck):	stq	r6, 0(r16)		C store quotient limb
   149  	lda	r16, -8(r16)
   150  	lda	r14, -1(r14)		C i--
   151  	bge	r14, L(loop)
   152  
   153  L(L10):	stq	r9, 8(r13)		C store low remainder limb
   154  	stq	r11, 16(r13)		C store high remainder limb
   155  	ldq	r0, 72(r30)		C return most_significant_q_limb
   156  	ldq	r26, 0(r30)
   157  	ldq	r9, 8(r30)
   158  	ldq	r10, 16(r30)
   159  	ldq	r11, 24(r30)
   160  	ldq	r12, 32(r30)
   161  	ldq	r13, 40(r30)
   162  	ldq	r14, 48(r30)
   163  	ldq	r15, 56(r30)
   164  	lda	r30, 80(r30)
   165  	ret	r31, (r26), 1
   166  
   167  L(fix):	cmpule	r11, r12, r1		C n1 >= d1 here, so this tests n1 == d1
   168  	cmpult	r9, r10, r2		C borrow from n0 - d0
   169  	and	r1, r2, r1		C {n1,n0} < {d1,d0}: nothing to fix
   170  	bne	r1, L(bck)
   171  	subq	r11, r12, r11		C n1 -= d1
   172  	subq	r11, r2, r11		C n1 -= borrow
   173  	subq	r9, r10, r9		C n0 -= d0
   174  	lda	r6, 1(r6)		C q++
   175  	br	L(bck)
   176  EPILOGUE()
   177  ASM_END()