github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/bdiv_q_1.asm (about)

     1  dnl  AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division.
     2  
     3  dnl  Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato.
     4  
     5  dnl  Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  
    36  C          cycles/limb
    37  C Athlon:     11.0
    38  C Hammer:      9.0
    39  
    40  
    41  C Inherited from mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
    42  C mp_limb_t divisor) in dive_1.asm; the entry points here are shown below.
    43  C
    44  C The dependent chain is mul+imul+sub for 11 cycles and that speed is
    45  C achieved with no special effort.  The load and shrld latencies are hidden
    46  C by out of order execution.
    47  C
    48  C It's a touch faster on size==1 to use the mul-by-inverse than divl.
    49  
    50  defframe(PARAM_SHIFT,  24)
    51  defframe(PARAM_INVERSE,20)
    52  defframe(PARAM_DIVISOR,16)
    53  defframe(PARAM_SIZE,   12)
    54  defframe(PARAM_SRC,    8)
    55  defframe(PARAM_DST,    4)
    56  
    57  defframe(SAVE_EBX,     -4)
    58  defframe(SAVE_ESI,     -8)
    59  defframe(SAVE_EDI,    -12)
    60  defframe(SAVE_EBP,    -16)
    61  defframe(VAR_INVERSE, -20)
    62  defframe(VAR_DST_END, -24)
    63  
    64  deflit(STACK_SPACE, 24)
    65  
    66  	TEXT
    67  
    68  C mp_limb_t
    69  C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
    70  C		    mp_limb_t inverse, int shift)
    71  	ALIGN(16)
    72  PROLOGUE(mpn_pi1_bdiv_q_1)
    73  deflit(`FRAME',0)
    74  
    75  	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
    76  	movl	PARAM_SHIFT, %ecx	C shift count
    77  
    78  	movl	%ebp, SAVE_EBP
    79  	movl	PARAM_SIZE, %ebp
    80  
    81  	movl	%esi, SAVE_ESI
    82  	movl	PARAM_SRC, %esi
    83  
    84  	movl	%edi, SAVE_EDI
    85  	movl	PARAM_DST, %edi
    86  
    87  	movl	%ebx, SAVE_EBX
    88  
    89  	leal	(%esi,%ebp,4), %esi	C src end
    90  	leal	(%edi,%ebp,4), %edi	C dst end
    91  	negl	%ebp			C -size
    92  
    93  	movl	PARAM_INVERSE, %eax	C inv
    94  
    95  L(common):
    96  	movl	%eax, VAR_INVERSE
    97  	movl	(%esi,%ebp,4), %eax	C src[0]
    98  
    99  	incl	%ebp
   100  	jz	L(one)
   101  
   102  	movl	(%esi,%ebp,4), %edx	C src[1]
   103  
   104  	shrdl(	%cl, %edx, %eax)
   105  
   106  	movl	%edi, VAR_DST_END
   107  	xorl	%ebx, %ebx
   108  	jmp	L(entry)
   109  
   110  	ALIGN(8)
   111  L(top):
   112  	C eax	q
   113  	C ebx	carry bit, 0 or 1
   114  	C ecx	shift
   115  	C edx
   116  	C esi	src end
   117  	C edi	dst end
   118  	C ebp	counter, limbs, negative
   119  
   120  	mull	PARAM_DIVISOR		C carry limb in edx
   121  
   122  	movl	-4(%esi,%ebp,4), %eax
   123  	movl	(%esi,%ebp,4), %edi
   124  
   125  	shrdl(	%cl, %edi, %eax)
   126  
   127  	subl	%ebx, %eax		C apply carry bit
   128  	setc	%bl
   129  	movl	VAR_DST_END, %edi
   130  
   131  	subl	%edx, %eax		C apply carry limb
   132  	adcl	$0, %ebx
   133  
   134  L(entry):
   135  	imull	VAR_INVERSE, %eax
   136  
   137  	movl	%eax, -4(%edi,%ebp,4)
   138  	incl	%ebp
   139  	jnz	L(top)
   140  
   141  
   142  	mull	PARAM_DIVISOR		C carry limb in edx
   143  
   144  	movl	-4(%esi), %eax		C src high limb
   145  	shrl	%cl, %eax
   146  	movl	SAVE_ESI, %esi
   147  
   148  	subl	%ebx, %eax		C apply carry bit
   149  	movl	SAVE_EBX, %ebx
   150  	movl	SAVE_EBP, %ebp
   151  
   152  	subl	%edx, %eax		C apply carry limb
   153  
   154  	imull	VAR_INVERSE, %eax
   155  
   156  	movl	%eax, -4(%edi)
   157  	movl	SAVE_EDI, %edi
   158  	addl	$STACK_SPACE, %esp
   159  
   160  	ret
   161  
   162  L(one):
   163  	shrl	%cl, %eax
   164  	movl	SAVE_ESI, %esi
   165  	movl	SAVE_EBX, %ebx
   166  
   167  	imull	VAR_INVERSE, %eax
   168  
   169  	movl	SAVE_EBP, %ebp
   170  
   171  	movl	%eax, -4(%edi)
   172  	movl	SAVE_EDI, %edi
   173  	addl	$STACK_SPACE, %esp
   174  
   175  	ret
   176  EPILOGUE()
   177  
   178  C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
   179  C                           mp_limb_t divisor);
   180  C
   181  
   182  	ALIGN(16)
   183  PROLOGUE(mpn_bdiv_q_1)
   184  deflit(`FRAME',0)
   185  
   186  	movl	PARAM_DIVISOR, %eax
   187  	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
   188  	movl	$-1, %ecx		C shift count
   189  
   190  	movl	%ebp, SAVE_EBP
   191  	movl	PARAM_SIZE, %ebp
   192  
   193  	movl	%esi, SAVE_ESI
   194  	movl	%edi, SAVE_EDI
   195  
   196  	C If there's usually only one or two trailing zero bits then this
   197  	C should be faster than bsfl.
   198  L(strip_twos):
   199  	incl	%ecx
   200  	shrl	%eax
   201  	jnc	L(strip_twos)
   202  
   203  	movl	%ebx, SAVE_EBX
   204  	leal	1(%eax,%eax), %ebx	C d without twos
   205  	andl	$127, %eax		C d/2, 7 bits
   206  
   207  ifdef(`PIC',`
   208  	LEA(	binvert_limb_table, %edx)
   209  	movzbl	(%eax,%edx), %eax		C inv 8 bits
   210  ',`
   211  	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
   212  ')
   213  
   214  	leal	(%eax,%eax), %edx	C 2*inv
   215  	movl	%ebx, PARAM_DIVISOR	C d without twos
   216  
   217  	imull	%eax, %eax		C inv*inv
   218  
   219  	movl	PARAM_SRC, %esi
   220  	movl	PARAM_DST, %edi
   221  
   222  	imull	%ebx, %eax		C inv*inv*d
   223  
   224  	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
   225  	leal	(%edx,%edx), %eax	C 2*inv
   226  
   227  	imull	%edx, %edx		C inv*inv
   228  
   229  	leal	(%esi,%ebp,4), %esi	C src end
   230  	leal	(%edi,%ebp,4), %edi	C dst end
   231  	negl	%ebp			C -size
   232  
   233  	imull	%ebx, %edx		C inv*inv*d
   234  
   235  	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
   236  
   237  	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
   238  	pushl	%eax	FRAME_pushl()
   239  	imull	PARAM_DIVISOR, %eax
   240  	cmpl	$1, %eax
   241  	popl	%eax	FRAME_popl()')
   242  
   243  	jmp	L(common)
   244  EPILOGUE()
   245  ASM_END()