github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/divrem_1.asm (about)

     1  dnl  AMD K6 mpn_divrem_1 -- mpn by limb division.
     2  
     3  dnl  Copyright 1999-2003, 2007 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C K6: 20 cycles/limb
    35  
    36  
    37  C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
    38  C                         mp_srcptr src, mp_size_t size, mp_limb_t divisor);
    39  C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
    40  C                          mp_srcptr src, mp_size_t size, mp_limb_t divisor,
    41  C                          mp_limb_t carry);
    42  C
    43  C The code here is basically the same as mpn/x86/divrem_1.asm, but uses loop
    44  C instead of decl+jnz, since it comes out 2 cycles/limb faster.
    45  C
    46  C A test is done to see if the high limb is less than the divisor, and if so
    47  C one less div is done.  A div is 20 cycles, so assuming high<divisor about
    48  C half the time, then this test saves half that amount.  The branch
    49  C misprediction penalty is less than that.
    50  C
    51  C Back-to-back div instructions run at 20 cycles, the same as the loop here,
    52  C so it seems there's nothing to gain by rearranging the loop.  Pairing the
    53  C mov and loop instructions was found to gain nothing.
    54  C
    55  C Enhancements:
    56  C
    57  C The low-latency K6 multiply might be thought to suit a mul-by-inverse, but
    58  C that algorithm has been found to suffer from the relatively poor carry
    59  C handling on K6 and too many auxiliary instructions.  The fractional part
    60  C however could be done at about 13 c/l, if it mattered enough.
    61  
    62  defframe(PARAM_CARRY,  24)
    63  defframe(PARAM_DIVISOR,20)
    64  defframe(PARAM_SIZE,   16)
    65  defframe(PARAM_SRC,    12)
    66  defframe(PARAM_XSIZE,  8)
    67  defframe(PARAM_DST,    4)
    68  
    69  	TEXT
    70  
    71  	ALIGN(32)
    72  PROLOGUE(mpn_divrem_1c)
    73  deflit(`FRAME',0)
    74  
    75  	movl	PARAM_SIZE, %ecx
    76  	pushl	%edi		FRAME_pushl()
    77  
    78  	movl	PARAM_SRC, %edi
    79  	pushl	%esi		FRAME_pushl()
    80  
    81  	movl	PARAM_DIVISOR, %esi
    82  	pushl	%ebx		FRAME_pushl()
    83  
    84  	movl	PARAM_DST, %ebx
    85  	pushl	%ebp		FRAME_pushl()
    86  
    87  	movl	PARAM_XSIZE, %ebp
    88  	orl	%ecx, %ecx		C size
    89  
    90  	movl	PARAM_CARRY, %edx
    91  	jz	L(fraction)		C if size==0
    92  
    93  	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
    94  	jmp	L(integer_top)
    95  
    96  EPILOGUE()
    97  
    98  
    99  	ALIGN(16)
   100  PROLOGUE(mpn_divrem_1)
   101  deflit(`FRAME',0)
   102  
   103  	movl	PARAM_SIZE, %ecx
   104  	pushl	%edi		FRAME_pushl()
   105  
   106  	movl	PARAM_SRC, %edi
   107  	pushl	%esi		FRAME_pushl()
   108  
   109  	movl	PARAM_DIVISOR, %esi
   110  	orl	%ecx,%ecx		C size
   111  
   112  	jz	L(size_zero)
   113  	pushl	%ebx		FRAME_pushl()
   114  
   115  	movl	-4(%edi,%ecx,4), %eax	C src high limb
   116  	xorl	%edx, %edx
   117  
   118  	movl	PARAM_DST, %ebx
   119  	pushl	%ebp		FRAME_pushl()
   120  
   121  	movl	PARAM_XSIZE, %ebp
   122  	cmpl	%esi, %eax
   123  
   124  	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
   125  	jae	L(integer_entry)
   126  
   127  
   128  	C high<divisor, so high of dst is zero, and avoid one div
   129  
   130  	movl	%edx, (%ebx,%ecx,4)
   131  	decl	%ecx
   132  
   133  	movl	%eax, %edx
   134  	jz	L(fraction)
   135  
   136  
   137  L(integer_top):
   138  	C eax	scratch (quotient)
   139  	C ebx	dst+4*xsize-4
   140  	C ecx	counter
   141  	C edx	scratch (remainder)
   142  	C esi	divisor
   143  	C edi	src
   144  	C ebp	xsize
   145  
   146  	movl	-4(%edi,%ecx,4), %eax
   147  L(integer_entry):
   148  
   149  	divl	%esi
   150  
   151  	movl	%eax, (%ebx,%ecx,4)
   152  	loop	L(integer_top)
   153  
   154  
   155  L(fraction):
   156  	orl	%ebp, %ecx
   157  	jz	L(done)
   158  
   159  	movl	PARAM_DST, %ebx
   160  
   161  
   162  L(fraction_top):
   163  	C eax	scratch (quotient)
   164  	C ebx	dst
   165  	C ecx	counter
   166  	C edx	scratch (remainder)
   167  	C esi	divisor
   168  	C edi
   169  	C ebp
   170  
   171  	xorl	%eax, %eax
   172  
   173  	divl	%esi
   174  
   175  	movl	%eax, -4(%ebx,%ecx,4)
   176  	loop	L(fraction_top)
   177  
   178  
   179  L(done):
   180  	popl	%ebp
   181  	movl	%edx, %eax
   182  	popl	%ebx
   183  	popl	%esi
   184  	popl	%edi
   185  	ret
   186  
   187  
   188  L(size_zero):
   189  deflit(`FRAME',8)
   190  	movl	PARAM_XSIZE, %ecx
   191  	xorl	%eax, %eax
   192  
   193  	movl	PARAM_DST, %edi
   194  
   195  	cld	C better safe than sorry, see mpn/x86/README
   196  
   197  	rep
   198  	stosl
   199  
   200  	popl	%esi
   201  	popl	%edi
   202  	ret
   203  EPILOGUE()