github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/pre_mod_1.asm (about)

     1  dnl  AMD K6 mpn_preinv_mod_1 -- mpn by 1 remainder, with pre-inverted divisor.
     2  
     3  dnl  Copyright 2000, 2002, 2003 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C K6: 18.0 cycles/limb
    35  
    36  
    37  C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
    38  C                             mp_limb_t inverse);
    39  C
    40  C This code is only 2 c/l faster than a simple divl, but that's 10% so it's
    41  C considered worthwhile (just).
        C
        C Returns, in eax, the remainder of the size-limb number at src divided by
        C divisor.  Limbs are processed from the high limb down to the low limb,
        C each step using one multiply by inverse and one multiply by divisor
        C rather than a divl.
        C
        C Requirements, as expressed by the ASSERTs below: size >= 1, and divisor
        C has its top bit (0x80000000) set.  How inverse must relate to divisor is
        C not stated in this file.  NOTE(review): presumably the usual GMP
        C pre-computed inverse of a normalized divisor -- confirm against callers.
        C
        C The four arguments are read from the stack through the PARAM_* names
        C defined below.  ebp, edi, esi and ebx are pushed on entry and popped
        C again before ret; eax, ecx and edx are overwritten.
    42  
        C Stack argument offsets.  NOTE(review): defframe is defined elsewhere;
        C the numbers look like byte offsets from esp at function entry, with
        C the return address at offset 0 -- confirm.
    43  defframe(PARAM_INVERSE,16)
    44  defframe(PARAM_DIVISOR,12)
    45  defframe(PARAM_SIZE,    8)
    46  defframe(PARAM_SRC,     4)
    47  
    48  	TEXT
    49  	ALIGN(32)
    50  PROLOGUE(mpn_preinv_mod_1)
    51  deflit(`FRAME',0)
    52  
        	C Argument checks: size >= 1 (unsigned compare) and divisor top bit
        	C set.  NOTE(review): ASSERT is defined elsewhere; presumably it
        	C only emits code in assert-enabled builds -- confirm.
    53  	ASSERT(ae,`cmpl $1, PARAM_SIZE')
    54  	ASSERT(nz,`testl $0x80000000, PARAM_DIVISOR')
    55  
        	C Argument loads are interleaved with the register saves.
    56  	movl	PARAM_SIZE, %ecx	C ecx = size
    57  	pushl	%ebp	FRAME_pushl()
    58  
    59  	movl	PARAM_SRC, %ebp		C ebp = src
    60  	pushl	%edi	FRAME_pushl()
    61  
    62  	movl	PARAM_DIVISOR, %eax	C eax = d
    63  	pushl	%esi	FRAME_pushl()
    64  
    65  	movl	-4(%ebp,%ecx,4), %esi	C src high limb, src[size-1]
    66  	pushl	%ebx	FRAME_pushl()
    67  
        	C edx has not been written by this function yet, so edi = edx is an
        	C arbitrary value.  Its only purpose is to make the first
        	C "sbbl %edx, %edi" (at L(top) or L(done_sbbl)) compute
        	C edi-edx-CF = -CF, where CF is the borrow from the subl below.
        	C decl, jz and loop all leave CF unchanged.
    68  	movl	%edx, %edi		C first n2 to cancel
    69  	subl	%eax, %esi		C first n1 = high-divisor
    70  
    71  	decl	%ecx			C ecx = size-1 limbs still to process
    72  	jz	L(done_sbbl)		C size==1: only the addback step remains
    73  
    74  L(top):
    75  	C eax	scratch (n2+n1, low product, d)
    76  	C ebx	limb, -n1, nadj, n2+1
    77  	C ecx	counter, size-1 to 1
    78  	C edx	scratch (inverse, high product, q1+1)
    79  	C esi	low n-(q1+1)*d on entry, then n10
    80  	C edi	old high, for underflow test; then n2
    81  	C ebp	src
    82  
    83  	sbbl	%edx, %edi	    C high n-(q1+1)*d, 0 or -1
    84  
        	C L(entry) is not referenced from anywhere in the code shown here.
    85  L(entry):
    86  	andl	PARAM_DIVISOR, %edi C d if the subtract went negative, else 0
    87  L(q1_ff_top):
    88  	movl	-4(%ebp,%ecx,4), %ebx C next limb down, src[ecx-1]
    89  
    90  	addl	%esi, %edi	    C possible addback, edi = n2
    91  	movl	%ebx, %esi	    C n10
    92  
    93  	sarl	$31, %ebx	    C -n1 = 0 or -1
    94  	movl	%edi, %eax	    C n2
    95  
    96  	movl	PARAM_INVERSE, %edx C m
    97  	subl	%ebx, %eax	    C n2+n1
    98  
    99  	mull	%edx		    C m*(n2+n1)
   100  
   101  	andl	PARAM_DIVISOR, %ebx C -n1 & d
   102  	addl	%esi, %ebx	    C nadj = n10 + (-n1&d), ignoring overflow
   103  
   104  	addl	%ebx, %eax	    C low m*(n2+n1) + nadj, giving carry flag
   105  	leal	1(%edi), %ebx	    C n2+1, leal so the carry flag is kept
   106  
   107  	adcl	%ebx, %edx	    C 1+high(n2<<32+m*(n2+n1)+nadj) = q1+1
   108  
   109  	movl	PARAM_DIVISOR, %eax C d
   110  	jz	L(q1_ff)	    C q1+1 == 0, flags still those of the adcl
   111  
   112  	mull	%edx		    C (q1+1)*d
   113  
   114  	subl	%eax, %esi	    C low  n-(q1+1)*d
   115  	loop	L(top)		    C ecx--, repeat while non-zero; flags untouched
   116  
   117  
   118  
        	C Final step: same sbbl/andl pair as at L(top), then the result is
        	C the low difference plus the conditional addback.
   119  L(done_sbbl):
   120  	sbbl	%edx, %edi	    C high n-(q1+1)*d, 0 or -1
   121  
   122  	andl	PARAM_DIVISOR, %edi C d if the subtract went negative, else 0
   123  L(done_esi_edi):
   124  	popl	%ebx
   125  
   126  	leal	(%esi,%edi), %eax   C return value = esi + edi
   127  	popl	%esi
   128  
   129  	popl	%edi
   130  	popl	%ebp
   131  
   132  	ret
   133  
   134  
   135  C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
   136  C of q*d is simply -d and the remainder n-q*d = n10+d.  This is rarely
   137  C reached.
        C
        C esi still holds n10 here.  Setting edi = d and re-entering at
        C L(q1_ff_top), which is past the sbbl/andl, makes the addl there
        C produce n10+d as the next n2.  If no limbs remain, L(done_esi_edi)
        C returns the same esi+edi sum.
   138  
   139  L(q1_ff):
   140  	movl	PARAM_DIVISOR, %edi C edi = d
   141  	loop	L(q1_ff_top)	    C ecx--, continue with next limb if any
   142  
   143  	jmp	L(done_esi_edi)	    C no limbs left, return esi+edi = n10+d
   144  
   145  
   146  EPILOGUE()