github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/bdiv_dbm1c.asm (about)

     1  dnl  Alpha mpn_bdiv_dbm1c.
     2  
     3  dnl  Copyright 2008 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C      cycles/limb
    34  C EV4:     42
    35  C EV5:     18
    36  C EV6:      3
    37  
    38  C TODO
    39  C  * Try less unrolling, 2-way should give the same performance.
    40  C  * Optimize feed-in and wind-down code, for speed, and perhaps further for
    41  C    code size.
    42  C  * This runs optimally given the algorithm; r8 is on a 3-operation recurrence
    43  C    path.  We have not tried very hard to find a better algorithm.  Perhaps
    44  C    it would be a good task for the GNU superoptimizer.
    45  
    46  C INPUT PARAMETERS
        C  These names are documentation only; the code below spells out the raw
        C  register numbers.  The carry-in is read from r20 on entry (mov r20, r8),
        C  while r19 is used solely as the multiplier, so cy names r20.
    47  define(`rp', `r16')
    48  define(`up', `r17')
    49  define(`n',  `r18')
    50  define(`bd', `r19')
    51  define(`cy', `r20')
    52  
    53  
    54  ASM_START()
        C mpn_bdiv_dbm1c
        C
        C Register use, as read from the code below:
        C   r16      destination pointer; each feed-in path biases it so that the
        C            shared store offsets 0..48(r16) land on the right limb
        C   r17      source pointer
        C   r18      limb count n, turned into a loop counter (n-4, then -4 per pass)
        C   r19      multiplier, never written
        C   r20      carry-in at entry; reused for a high product half afterwards
        C   r8       running value h, seeded from r20
        C   r24      first source limb
        C   r0-r3    further source limbs
        C   r4-r7    low product halves (mulq); paired with r20-r23 respectively
        C   r20-r23  high product halves (umulh)
        C   r28      n mod 4 during dispatch, afterwards hi + borrow of the last step
        C
        C Work done per source limb u, with (hi,lo) = u * r19:
        C   borrow = h < lo;  h -= lo;  store h;  h -= hi + borrow
        C The final h is returned in r0.
        C
        C n = 0 is not special-cased: n mod 4 = 0 selects L(b0), which always
        C processes at least four limbs.
    55  PROLOGUE(mpn_bdiv_dbm1c)
    56  	mov	r20, r8	C h = carry-in
    57  
    58  	ldq	r24, 0(r17)	C first source limb, loaded on every path
    59  	and	r18, 3, r28	C r28 = n mod 4, selects the feed-in path
    60  	lda	r18, -4(r18)	C r18 = n - 4
    61  	beq	r28, L(b0)
    62  	cmpeq	r28, 1, r21
    63  	bne	r21, L(b1)
    64  	cmpeq	r28, 2, r21
    65  	bne	r21, L(b2)
    66  
    67  
        C Feed-in for n mod 4 = 3.
    68  L(b3):	ldq	r2, 8(r17)
    69  	ldq	r3, 16(r17)
    70  	bgt	r18, L(gt3)	C taken when n - 4 > 0
    71  
        C n = 3: form all three products, then only the retire-only tail is needed.
    72  	mulq	r24, r19, r5	C U1
    73  	umulh	r24, r19, r21	C U1
    74  	mulq	r2, r19, r6	C U1
    75  	umulh	r2, r19, r22	C U1
    76  	mulq	r3, r19, r7	C U1
    77  	umulh	r3, r19, r23	C U1
    78  	lda	r16, -32(r16)	C bias so that 32(r16) in L(cj3) is the first result limb
    79  	br	L(cj3)
    80  
    81  L(gt3):	ldq	r0, 24(r17)
    82  	mulq	r24, r19, r5	C U1
    83  	umulh	r24, r19, r21	C U1
    84  	ldq	r1, 32(r17)
    85  	mulq	r2, r19, r6	C U1
    86  	umulh	r2, r19, r22	C U1
    87  	ldq	r2, 40(r17)
    88  	mulq	r3, r19, r7	C U1
    89  	umulh	r3, r19, r23	C U1
    90  	ldq	r3, 48(r17)
    91  	lda	r18, -4(r18)	C r18 = n - 8
    92  	lda	r17, 56(r17)	C seven source limbs have been loaded
    93  	mulq	r0, r19, r4	C U1
    94  	bgt	r18, L(L3)	C more limbs remain: join the loop at its first stage
    95  
    96  	br	L(cj7)	C otherwise go straight to the wind-down
    97  
    98  
        C Feed-in for n mod 4 = 2.
    99  L(b2):	ldq	r3, 8(r17)
   100  	bgt	r18, L(gt2)	C taken when n - 4 > 0
   101  
        C n = 2: both products here, then the last two retire-only steps.
   102  	mulq	r24, r19, r6	C U1
   103  	umulh	r24, r19, r22	C U1
   104  	mulq	r3, r19, r7	C U1
   105  	umulh	r3, r19, r23	C U1
   106  	lda	r16, -40(r16)	C bias so that 40(r16) in L(cj2) is the first result limb
   107  	br	L(cj2)
   108  
   109  L(gt2):	ldq	r0, 16(r17)
   110  	ldq	r1, 24(r17)
   111  	mulq	r24, r19, r6	C U1
   112  	umulh	r24, r19, r22	C U1
   113  	ldq	r2, 32(r17)
   114  	mulq	r3, r19, r7	C U1
   115  	umulh	r3, r19, r23	C U1
   116  	ldq	r3, 40(r17)
   117  	lda	r18, -4(r18)	C r18 = n - 8
   118  	lda	r17, 48(r17)	C six source limbs have been loaded
   119  	mulq	r0, r19, r4	C U1
   120  	umulh	r0, r19, r20	C U1
   121  	lda	r16, -8(r16)	C bias so that 8(r16) in L(L2)/L(cj6) is the first result limb
   122  	bgt	r18, L(gt6)
   123  
   124  	mulq	r1, r19, r5	C U1
   125  	br	L(cj6)
   126  
   127  L(gt6):	ldq	r0, 0(r17)	C r0 was consumed above; refill it before joining the loop
   128  	mulq	r1, r19, r5	C U1
   129  	br	L(L2)
   130  
   131  
        C Feed-in for n mod 4 = 1.
   132  L(b1):	bgt	r18, L(gt1)	C taken when n - 4 > 0
   133  
        C n = 1: a single product, then the last retire-only step.
   134  	mulq	r24, r19, r7	C U1
   135  	umulh	r24, r19, r23	C U1
   136  	lda	r16, -48(r16)	C bias so that 48(r16) in L(cj1) is the first result limb
   137  	br	L(cj1)
   138  
   139  L(gt1):	ldq	r0, 8(r17)
   140  	ldq	r1, 16(r17)
   141  	ldq	r2, 24(r17)
   142  	mulq	r24, r19, r7	C U1
   143  	umulh	r24, r19, r23	C U1
   144  	ldq	r3, 32(r17)
   145  	lda	r18, -4(r18)	C r18 = n - 8
   146  	lda	r17, 40(r17)	C five source limbs have been loaded
   147  	mulq	r0, r19, r4	C U1
   148  	umulh	r0, r19, r20	C U1
   149  	lda	r16, -16(r16)	C bias so that 16(r16) in L(L1)/L(cj5) is the first result limb
   150  	bgt	r18, L(gt5)
   151  
   152  	mulq	r1, r19, r5	C U1
   153  	umulh	r1, r19, r21	C U1
   154  	mulq	r2, r19, r6	C U1
   155  	br	L(cj5)
   156  
   157  L(gt5):	ldq	r0, 0(r17)	C refill r0 and r1 as their products are started
   158  	mulq	r1, r19, r5	C U1
   159  	umulh	r1, r19, r21	C U1
   160  	ldq	r1, 8(r17)
   161  	mulq	r2, r19, r6	C U1
   162  	br	L(L1)
   163  
   164  
        C Feed-in for n mod 4 = 0.
   165  L(b0):	ldq	r1, 8(r17)
   166  	ldq	r2, 16(r17)
   167  	ldq	r3, 24(r17)
   168  	lda	r17, 32(r17)	C four source limbs have been loaded
   169  	lda	r16, -24(r16)	C bias so that 24(r16) in L(L0)/L(cj4) is the first result limb
   170  	mulq	r24, r19, r4	C U1
   171  	umulh	r24, r19, r20	C U1
   172  	bgt	r18, L(gt4)	C taken when n - 4 > 0
   173  
   174  	mulq	r1, r19, r5	C U1
   175  	umulh	r1, r19, r21	C U1
   176  	mulq	r2, r19, r6	C U1
   177  	umulh	r2, r19, r22	C U1
   178  	mulq	r3, r19, r7	C U1
   179  	br	L(cj4)
   180  
   181  L(gt4):	ldq	r0, 0(r17)	C refill r0-r2 as their products are started
   182  	mulq	r1, r19, r5	C U1
   183  	umulh	r1, r19, r21	C U1
   184  	ldq	r1, 8(r17)
   185  	mulq	r2, r19, r6	C U1
   186  	umulh	r2, r19, r22	C U1
   187  	ldq	r2, 16(r17)
   188  	mulq	r3, r19, r7	C U1
   189  	br	L(L0)
   190  
        C Four limbs per pass.  Each stage starts the product of one loaded limb,
        C reloads that source register, and retires an older product pair into h:
        C   stage at  0(r16): starts r4/r20 from r0, retires r5/r21
        C   stage at  8(r16): starts r5/r21 from r1, retires r6/r22
        C   stage at 16(r16): starts r6/r22 from r2, retires r7/r23
        C   stage at 24(r16): starts r7/r23 from r3, retires r4/r20
        C L(L3)..L(L0) are the entry points used by the feed-in paths; each sits
        C after the mulq/subq pair its feed-in path has already accounted for.
   191  C *** MAIN LOOP START ***
   192  	ALIGN(16)
   193  L(top):	mulq	r0, r19, r4	C U1
   194  	subq	r8, r28, r8	C h -= hi + borrow left by the previous stage
   195  L(L3):	umulh	r0, r19, r20	C U1
   196  	cmpult	r8, r5, r28	C borrow = h < lo
   197  	ldq	r0, 0(r17)	C next source limb for this slot
   198  	subq	r8, r5, r8	C h -= lo
   199  	addq	r21, r28, r28	C r28 = hi + borrow, subtracted by the next stage
   200  	stq	r8, 0(r16)	C result limb
   201  
   202  	mulq	r1, r19, r5	C U1
   203  	subq	r8, r28, r8
   204  L(L2):	umulh	r1, r19, r21	C U1
   205  	cmpult	r8, r6, r28
   206  	ldq	r1, 8(r17)
   207  	subq	r8, r6, r8
   208  	addq	r22, r28, r28
   209  	stq	r8, 8(r16)
   210  
   211  	mulq	r2, r19, r6	C U1
   212  	subq	r8, r28, r8
   213  L(L1):	umulh	r2, r19, r22	C U1
   214  	cmpult	r8, r7, r28
   215  	ldq	r2, 16(r17)
   216  	subq	r8, r7, r8
   217  	addq	r23, r28, r28
   218  	stq	r8, 16(r16)
   219  
   220  	mulq	r3, r19, r7	C U1
   221  	subq	r8, r28, r8
   222  L(L0):	umulh	r3, r19, r23	C U1
   223  	cmpult	r8, r4, r28
   224  	ldq	r3, 24(r17)
   225  	subq	r8, r4, r8
   226  	addq	r20, r28, r28
   227  	stq	r8, 24(r16)
   228  
   229  	lda	r18, -4(r18)	C four limbs retired
   230  	lda	r17, 32(r17)
   231  	lda	r16, 32(r16)
   232  	bgt	r18, L(top)
   233  C *** MAIN LOOP END ***
   234  
        C Wind-down.  L(cj7)..L(cj4) repeat the four loop stages without the
        C loads; L(cj3)..L(cj1) only retire the product pairs still outstanding
        C (r5/r21, r6/r22, r7/r23) and store at 32, 40 and 48(r16).
   235  	mulq	r0, r19, r4	C U1
   236  	subq	r8, r28, r8
   237  L(cj7):	umulh	r0, r19, r20	C U1
   238  	cmpult	r8, r5, r28
   239  	subq	r8, r5, r8
   240  	addq	r21, r28, r28
   241  	stq	r8, 0(r16)
   242  	mulq	r1, r19, r5	C U1
   243  	subq	r8, r28, r8
   244  L(cj6):	umulh	r1, r19, r21	C U1
   245  	cmpult	r8, r6, r28
   246  	subq	r8, r6, r8
   247  	addq	r22, r28, r28
   248  	stq	r8, 8(r16)
   249  	mulq	r2, r19, r6	C U1
   250  	subq	r8, r28, r8
   251  L(cj5):	umulh	r2, r19, r22	C U1
   252  	cmpult	r8, r7, r28
   253  	subq	r8, r7, r8
   254  	addq	r23, r28, r28
   255  	stq	r8, 16(r16)
   256  	mulq	r3, r19, r7	C U1
   257  	subq	r8, r28, r8
   258  L(cj4):	umulh	r3, r19, r23	C U1
   259  	cmpult	r8, r4, r28
   260  	subq	r8, r4, r8
   261  	addq	r20, r28, r28
   262  	stq	r8, 24(r16)
   263  	subq	r8, r28, r8
   264  L(cj3):	cmpult	r8, r5, r28
   265  	subq	r8, r5, r8
   266  	addq	r21, r28, r28
   267  	stq	r8, 32(r16)
   268  	subq	r8, r28, r8
   269  L(cj2):	cmpult	r8, r6, r28
   270  	subq	r8, r6, r8
   271  	addq	r22, r28, r28
   272  	stq	r8, 40(r16)
   273  	subq	r8, r28, r8
   274  L(cj1):	cmpult	r8, r7, r28
   275  	subq	r8, r7, r8
   276  	addq	r23, r28, r28
   277  	stq	r8, 48(r16)	C last result limb
   278  	subq	r8, r28, r0	C final h, left in r0 as the result
   279  	ret	r31, (r26), 1	C return through the address in r26
   280  
   281  EPILOGUE()
   282  ASM_END()