github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc32/divrem_2.asm (about)

     1  dnl  PPC-32 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
     2  
     3  dnl  Copyright 2007, 2008, 2012 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C		cycles/limb
    34  C		norm	frac
    35  C 7410		~36.5	~36.5
    36  C 744x, 745x	 29	 29
    37  
    38  C INPUT PARAMETERS
    39  C qp  = r3
    40  C fn  = r4
    41  C up  = r5
    42  C un  = r6
    43  C d   = r7
    44  
    45  C TODO
    46  C  * Decrease register usage.
     47  C  * Make sure mul operands are optimal for early-out.
    48  C  * Check that things work well for a shared library build.
    49  C  * Write an invert_limb, perhaps inline, perhaps as a private call.  Or at
    50  C    least vastly improve the current __udiv_qrnnd_c based code.
    51  
    52  
    53  ASM_START()
     54  PROLOGUE(mpn_divrem_2)
         C Divide the un-limb number at up, extended below by fn zero limbs, by the
         C 2-limb divisor at d (d1 = high limb in r10, d0 = low limb in r7).
         C Each pass of the main loop stores one quotient limb through r8, walking
         C downward from qp[un+fn-3]; un+fn-2 limbs are stored in total.
         C On exit the 2-limb remainder is written to 0(r29) and 4(r29), and r3 is
         C left holding 0 or 1 (the msl entry of the register map further down).
         C The file header says the divisor is normalized; nothing here checks that.
     55  	stwu	r1, -32(r1)		C allocate a 32-byte stack frame
     56  	slwi	r0, r6, 2		C r0 = un * 4, limbs are 4 bytes
     57  	add	r5, r5, r0		C r5 = up + un limbs
     58  	stmw	r28, 8(r1)		C save r28..r31 in the frame
     59  	addi	r29, r5, -8		C up = up_param + un - 2
     60  	lwz	r10, 4(r7)		C r10 = d1, high divisor limb
     61  	lwz	r12, 4(r29)		C r12 = n2, top dividend limb
     62  	addi	r8, r3, -12		C r8 = qp - 3 limbs, advanced below once un + fn is known
     63  	lwz	r7, 0(r7)		C r7 = d0, low divisor limb (pointer d is overwritten)
     64  	cmplw	cr7, r12, r10		C compare the top limbs first
     65  	lwz	r28, 0(r29)		C r28 = n1, second dividend limb
     66  	blt-	cr7, L(2)		C n2 < d1: top two limbs are below d
     67  	bgt+	cr7, L(4)		C n2 > d1: top two limbs are above d
     68  	cmplw	cr7, r28, r7		C top limbs equal, decide on n1 vs d0
     69  	blt-	cr7, L(2)
     70  L(4):	subfc	r28, r7, r28		C n2:n1 -= d1:d0, low limb first
     71  	subfe	r12, r10, r12		C high limb with borrow
     72  	li	r3, 1			C msl = 1
     73  	b	L(6)
     74  L(2):	li	r3, 0			C msl = 0
     75  
     76  L(6):	add	r0, r4, r6		C r0 = fn + un
     77  	addic.	r30, r0, -2		C r30 = fn + un - 2 = quotient limbs to produce
     78  	ble-	cr0, L(ret)		C nothing to produce, just store the remainder
     79  
     80  	slwi	r9, r0, 2		C r9 = (fn + un) * 4
     81  	add	r8, r8, r9		C qp += un + fn, so r8 = address of qp[un+fn-3]
     82  	mtctr	r30			C loop count for the bdnz at the bottom
     83  
     84  C Compute di from d1
         C Two 16-bit division steps in the style of the __udiv_qrnnd_c based code
         C named in the TODO list.  The dividend is ~d1 followed by 32 one bits,
         C which enter 16 at a time through the ori 65535 instructions.
         C r31 collects the upper quotient half q1, r6 the lower half q0.
     85  	srwi	r11, r10, 16		C r11 = high 16 bits of d1
     86  	nor	r0, r10, r10		C r0 = ~d1
     87  	divwu	r31, r0, r11		C q1 estimate = ~d1 / high half of d1
     88  	rlwinm	r5, r10, 0, 16, 31	C r5 = low 16 bits of d1
     89  	mullw	r9, r11, r31		C high half of d1 * q1
     90  	mullw	r6, r5, r31		C low half of d1 * q1
     91  	subf	r0, r9, r0		C remainder of the divwu
     92  	slwi	r0, r0, 16
     93  	ori	r0, r0, 65535		C r0 = (remainder << 16) | 0xffff
     94  	cmplw	cr7, r0, r6
     95  	bge-	cr7, L(9)		C r0 >= r6: estimate needs no correction
     96  	add	r0, r0, r10		C add d1 back
     97  	cmplw	cr7, r0, r10		C cr7 LT is set if that add wrapped
     98  	cmplw	cr6, r6, r0		C cr6 GT is set if r6 is still above r0
     99  	addi	r31, r31, -1		C q1--
    100  	crorc	28, 28, 25		C CR bit 28 (cr7 LT) |= NOT CR bit 25 (cr6 GT)
    101  	bc+	12, 28, L(9)		C done if the add wrapped or r0 >= r6
    102  	addi	r31, r31, -1		C q1--
    103  	add	r0, r0, r10		C add d1 back a second time
    104  L(9):	subf	r0, r6, r0		C remainder carried into the second step
    105  	divwu	r6, r0, r11		C q0 estimate
    106  	mullw	r9, r11, r6		C high half of d1 * q0
    107  	mullw	r11, r5, r6		C low half of d1 * q0 (r11 is reused from here on)
    108  	subf	r0, r9, r0		C remainder of the divwu
    109  	slwi	r0, r0, 16
    110  	ori	r0, r0, 65535		C r0 = (remainder << 16) | 0xffff
    111  	cmplw	cr7, r0, r11
    112  	bge-	cr7, L(13)		C r0 >= r11: estimate needs no correction
    113  	add	r0, r0, r10		C add d1 back
    114  	cmplw	cr7, r0, r10		C cr7 LT is set if that add wrapped
    115  	cmplw	cr6, r11, r0		C cr6 GT is set if r11 is still above r0
    116  	addi	r6, r6, -1		C q0--
    117  	crorc	28, 28, 25		C same CR bit combination as in the first step
    118  	bc+	12, 28, L(13)		C done if the add wrapped or r0 >= r11
    119  C	add	r0, r0, r10		C final remainder
    120  	addi	r6, r6, -1		C q0--
    121  L(13):	rlwimi	r6, r31, 16, 0, 15	C assemble final quotient
    122  
    123  C Adjust di by including d0
         C r0 holds (carries out of t0 + d0 + s1) - 1.  While r0 is non-negative,
         C d1 is subtracted from r9 and di is decremented.
    124  	mullw	r9, r10, r6		C t0 = LO(di * d1)
    125  	addc	r11, r9, r7		C r11 = t0 + d0, carry goes to CA
    126  	subfe	r0, r1, r1		C r0 = CA - 1: 0 after a carry, else -1 (r1 cancels out)
    127  	mulhwu	r9, r6, r7		C s1 = HI(di * d0)
    128  	addc	r9, r11, r9		C r9 = t0 + d0 + s1, carry goes to CA
    129  	addze.	r0, r0			C r0 += CA, result recorded in cr0
    130  	blt	cr0, L(17)		C no carry at all: di stays as it is
    131  L(18):	subfc	r9, r10, r9		C r9 -= d1, CA = 1 when there is no borrow
    132  	addi	r6, r6, -1		C di--
    133  	addme.	r0, r0			C r0 += CA - 1, result recorded in cr0
    134  	bge+	cr0, L(18)		C repeat while r0 is non-negative
    135  L(17):
    136  
    137  C r0  r3  r4  r5  r6  r7  r8  r9 r10 r11 r12 r28 r29 r30 r31
    138  C     msl fn      di  d0  qp     d1      n2  n1  up  un
         C r12:r28 is the running 2-limb remainder.  r30 is the count of quotient
         C limbs still to produce; it starts at un + fn - 2 and mirrors the ctr.
    139  L(loop):
    140  	mullw	r0, r12, r6		C q0 = LO(n2 * di)
    141  	cmpw	cr7, r30, r4		C remaining count vs fn, tested by the ble below
    142  	addc	r31, r0, r28		C q0 += n1
    143  	mulhwu	r9, r12, r6		C q  = HI(n2 * di)
    144  	adde	r12, r9, r12		C q  += n2
    145  	addi	r30, r30, -1		C one limb fewer remaining
    146  	mullw	r0, r10, r12		C d1 * q
    147  	li	r9, 0			C next dividend limb defaults to 0
    148  	subf	r0, r0, r28		C n1 -= d1 * q
    149  	addi	r5, r12, 1		C candidate quotient limb = q + 1
    150  	ble-	cr7, L(23)		C count <= fn: no limbs left at up, keep the 0
    151  	lwzu	r9, -4(r29)		C otherwise step up down one limb and load it
    152  L(23):	mullw	r11, r12, r7		C t0 = LO(d0 * q)
    153  	subfc	r28, r7, r9		C n0 -= d0
    154  	subfe	r0, r10, r0		C n1 -= d1
    155  	mulhwu	r12, r12, r7		C t1 = HI(d0 * q)
    156  	subfc	r28, r11, r28		C n0 -= t0
    157  	subfe	r12, r12, r0		C n1 -= t1
    158  	cmplw	cr7, r12, r31		C new n1 vs q0
    159  	blt+	cr7, L(24)		C n1 < q0: the candidate q + 1 stands
    160  	addc	r28, r28, r7		C otherwise add d0 back to n0
    161  	adde	r12, r12, r10		C and d1 plus carry back to n1
    162  	addi	r5, r5, -1		C quotient limb = q
    163  L(24):	cmplw	cr7, r12, r10		C n1 vs d1, cr7 is reused in the fix path
    164  	bge-	cr7, L(fix)		C n1 >= d1: remainder may still be >= d
    165  L(bck):	stw	r5, 0(r8)		C store the quotient limb
    166  	addi	r8, r8, -4		C move qp down one limb
    167  	bdnz	L(loop)
    168  
    169  L(ret):	stw	r28, 0(r29)		C store the low remainder limb
    170  	stw	r12, 4(r29)		C store the high remainder limb
    171  	lmw	r28, 8(r1)		C restore r28..r31
    172  	addi	r1, r1, 32		C release the stack frame
    173  	blr
    174  
    175  L(fix):	cmplw	cr6, r28, r7		C low remainder limb vs d0
    176  	bgt+	cr7, L(28)		C n1 > d1: subtract d
    177  	blt-	cr6, L(bck)		C n1 == d1 and low limb < d0: already below d
    178  L(28):	subfc	r28, r7, r28		C remainder -= d, low limb first
    179  	subfe	r12, r10, r12		C high limb with borrow
    180  	addi	r5, r5, 1		C quotient limb++
    181  	b	L(bck)
    182  EPILOGUE()