github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/dive_1.asm (about)

     1  dnl  AMD K7 mpn_divexact_1 -- mpn by limb exact division.
     2  
     3  dnl  Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C          cycles/limb
    35  C Athlon:     11.0
    36  C Hammer:      9.0
    37  
    38  
    39  C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
    40  C                      mp_limb_t divisor);
    41  C
    42  C The dependent chain is mul+imul+sub for 11 cycles and that speed is
    43  C achieved with no special effort.  The load and shrdl latencies are hidden
    44  C by out of order execution.
    45  C
    46  C It's a touch faster on size==1 to use the mul-by-inverse than divl.
    47  
    48  defframe(PARAM_DIVISOR,16)	C mp_limb_t divisor, later overwritten with the odd part of d
    49  defframe(PARAM_SIZE,   12)	C mp_size_t size, counted in limbs
    50  defframe(PARAM_SRC,    8)	C mp_srcptr src
    51  defframe(PARAM_DST,    4)	C mp_ptr dst
    52  C Slots at negative offsets hold four saved registers and two variables.
    53  defframe(SAVE_EBX,     -4)
    54  defframe(SAVE_ESI,     -8)
    55  defframe(SAVE_EDI,    -12)
    56  defframe(SAVE_EBP,    -16)
    57  defframe(VAR_INVERSE, -20)	C inverse of the odd divisor mod 2^GMP_LIMB_BITS
    58  defframe(VAR_DST_END, -24)	C dst end pointer, parked here while edi is scratch
    59  C Six 4-byte slots are declared above, matching the 24 bytes reserved here.
    60  deflit(STACK_SPACE, 24)
    61  
    62  	TEXT
    63  C ALIGN is defined outside this file; presumably it pads the entry point to a 16-byte boundary.
    64  	ALIGN(16)
    65  PROLOGUE(mpn_divexact_1)
    66  deflit(`FRAME',0)
    67  	C Plan: split d into 2^shift times an odd part, invert the odd part mod 2^GMP_LIMB_BITS, then each quotient limb is (shifted src limb - carry) * inverse.
    68  	movl	PARAM_DIVISOR, %eax	C d
    69  	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
    70  	movl	$-1, %ecx		C shift count, the first incl below makes it 0
    71  	C ebp, esi, edi and ebx go to their SAVE slots here and are reloaded before each ret.
    72  	movl	%ebp, SAVE_EBP
    73  	movl	PARAM_SIZE, %ebp	C size
    74  
    75  	movl	%esi, SAVE_ESI
    76  	movl	%edi, SAVE_EDI
    77  
    78  	C If there's usually only one or two trailing zero bits then this
    79  	C should be faster than bsfl.
    80  L(strip_twos):
    81  	incl	%ecx
    82  	shrl	%eax			C carry = the bit shifted out
    83  	jnc	L(strip_twos)		C repeat while that bit was 0
    84  	C NOTE(review): d == 0 never sets carry, so the loop above would not end; presumably callers never pass 0.
    85  	movl	%ebx, SAVE_EBX
    86  	leal	1(%eax,%eax), %ebx	C d without twos
    87  	andl	$127, %eax		C d/2, 7 bits
    88  	C Seed the inverse with 8 bits from binvert_limb_table (defined elsewhere), indexed by bits 1-7 of the odd d.
    89  ifdef(`PIC',`
    90  	LEA(	binvert_limb_table, %edx)
    91  	movzbl	(%eax,%edx), %eax		C inv 8 bits
    92  ',`
    93  	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
    94  ')
    95  	C Two Newton steps inv = 2*inv - inv*inv*d extend the 8 bit seed to a full limb (see the ASSERT below); pointer setup is interleaved with the imull chain.
    96  	leal	(%eax,%eax), %edx	C 2*inv
    97  	movl	%ebx, PARAM_DIVISOR	C d without twos, replacing the argument
    98  
    99  	imull	%eax, %eax		C inv*inv
   100  
   101  	movl	PARAM_SRC, %esi
   102  	movl	PARAM_DST, %edi
   103  
   104  	imull	%ebx, %eax		C inv*inv*d
   105  
   106  	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
   107  	leal	(%edx,%edx), %eax	C 2*inv
   108  
   109  	imull	%edx, %edx		C inv*inv
   110  
   111  	leal	(%esi,%ebp,4), %esi	C src end
   112  	leal	(%edi,%ebp,4), %edi	C dst end
   113  	negl	%ebp			C -size
   114  
   115  	imull	%ebx, %edx		C inv*inv*d
   116  
   117  	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
   118  	C Check of the finished inverse via the ASSERT macro (defined elsewhere); eax is preserved by the push and pop.
   119  	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
   120  	pushl	%eax	FRAME_pushl()
   121  	imull	PARAM_DIVISOR, %eax
   122  	cmpl	$1, %eax
   123  	popl	%eax	FRAME_popl()')
   124  
   125  	movl	%eax, VAR_INVERSE
   126  	movl	(%esi,%ebp,4), %eax	C src[0]
   127  
   128  	incl	%ebp			C -(size-1)
   129  	jz	L(one)			C size==1, no second limb to shift in
   130  
   131  	movl	(%esi,%ebp,4), %edx	C src[1]
   132  
   133  	shrdl(	%cl, %edx, %eax)	C eax = low limb of src[1]:src[0] >> shift
   134  
   135  	movl	%edi, VAR_DST_END	C edi is reused as scratch inside the loop
   136  	xorl	%ebx, %ebx		C no carry bit yet, d now lives only in PARAM_DIVISOR
   137  	jmp	L(entry)		C the first limb has no carry to apply
   138  
   139  	ALIGN(8)
   140  L(top):
   141  	C eax	q
   142  	C ebx	carry bit, 0 or 1
   143  	C ecx	shift
   144  	C edx	scratch, carry limb after the mull
   145  	C esi	src end
   146  	C edi	dst end, scratch between the src load and the VAR_DST_END reload
   147  	C ebp	counter, limbs, negative
   148  
   149  	mull	PARAM_DIVISOR		C carry limb in edx
   150  
   151  	movl	-4(%esi,%ebp,4), %eax	C current src limb
   152  	movl	(%esi,%ebp,4), %edi	C next src limb, supplies the bits shifted in
   153  
   154  	shrdl(	%cl, %edi, %eax)	C eax = current limb of src >> shift
   155  
   156  	subl	%ebx, %eax		C apply carry bit
   157  	setc	%bl			C borrow into bl, upper bits of ebx are already zero
   158  	movl	VAR_DST_END, %edi	C edi back to dst end
   159  
   160  	subl	%edx, %eax		C apply carry limb
   161  	adcl	$0, %ebx		C fold in the borrow from the carry limb subtract
   162  
   163  L(entry):
   164  	imull	VAR_INVERSE, %eax	C q = limb * inverse, low limb only
   165  
   166  	movl	%eax, -4(%edi,%ebp,4)	C store quotient limb
   167  	incl	%ebp
   168  	jnz	L(top)			C until only the high limb remains
   169  
   170  	C High limb: same steps as the loop, but a plain shrl since no higher limb exists; register restores are interleaved.
   171  	mull	PARAM_DIVISOR		C carry limb in edx
   172  
   173  	movl	-4(%esi), %eax		C src high limb
   174  	shrl	%cl, %eax
   175  	movl	SAVE_ESI, %esi
   176  
   177  	subl	%ebx, %eax		C apply carry bit
   178  	movl	SAVE_EBX, %ebx
   179  	movl	SAVE_EBP, %ebp
   180  
   181  	subl	%edx, %eax		C apply carry limb
   182  
   183  	imull	VAR_INVERSE, %eax
   184  
   185  	movl	%eax, -4(%edi)		C dst high limb
   186  	movl	SAVE_EDI, %edi
   187  	addl	$STACK_SPACE, %esp
   188  
   189  	ret
   190  
   191  	C size==1: the only quotient limb is (src[0] >> shift) * inverse.
   192  L(one):
   193  	shrl	%cl, %eax
   194  	movl	SAVE_ESI, %esi
   195  	movl	SAVE_EBX, %ebx
   196  
   197  	imull	VAR_INVERSE, %eax
   198  
   199  	movl	SAVE_EBP, %ebp
   200  	movl	%eax, -4(%edi)		C dst[0], edi is dst end
   201  
   202  	movl	SAVE_EDI, %edi
   203  	addl	$STACK_SPACE, %esp
   204  
   205  	ret
   206  
   207  EPILOGUE()
   208  ASM_END()