github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/dive_1.asm (about)

     1  dnl  Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
     2  
     3  dnl  Copyright 2001, 2002, 2014 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C         divisor
    35  C       odd   even
    36  C P54:  24.5  30.5   cycles/limb
    37  C P55:  23.0  28.0
    38  
    39  
    40  C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
    41  C                      mp_limb_t divisor);
    42  C
    43  C Plain divl is used for small sizes, since the inverse takes a while to
    44  C set up.  Multiplying works out faster for size>=3 when the divisor is odd,
    45  C or size>=4 when the divisor is even.  Actually on P55 size==2 for odd or
    46  C size==3 for even are about the same speed with either divl or mul, but
    47  C divl is used since it will use up less code cache.
    48  C
    49  C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
    50  C expected.  On P54 in the even case the shrdl pairing nonsense (see
    51  C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
    52  C further 1.5 slowdown for both odd and even.
    53  
    54  defframe(PARAM_DIVISOR,16)
    55  defframe(PARAM_SIZE,   12)
    56  defframe(PARAM_SRC,    8)
    57  defframe(PARAM_DST,    4)
        dnl  The four names above follow the order of the C prototype (dst, src,
        dnl  size, divisor).  defframe is defined outside this file; its second
        dnl  argument is presumably the byte offset of the stack slot at function
        dnl  entry -- TODO confirm against the defframe definition.
    58  
    59  dnl  re-use parameter space: VAR_INVERSE names the same stack slot as
        dnl  PARAM_DST.  The function copies PARAM_DST into %edi before it stores
        dnl  the inverse through VAR_INVERSE, so the dst pointer is not lost.
    60  define(VAR_INVERSE,`PARAM_DST')
    61  
    62  	TEXT
    63  
    64  	ALIGN(32)
    65  PROLOGUE(mpn_divexact_1)
    66  deflit(`FRAME',0)
    67  
        	C Entry point.  Loads the four stack parameters, saves esi and edi, and
        	C chooses between two methods:
        	C   - the plain divl loop directly below, or
        	C   - L(mul_by_inverse), taken when size + (divisor & 1) >= 4, that is
        	C     size >= 3 for an odd divisor or size >= 4 for an even one.
        	C
        	C Registers on the divl path:
        	C   eax  scratch, then src limb / quotient limb
        	C   ecx  size, counts down to zero
        	C   edx  remainder carried from the limb above
        	C   esi  src
        	C   edi  dst
        	C
        	C FRAME starts at zero and every push below is followed by
        	C FRAME_pushl(); presumably this lets the PARAM_ names keep addressing
        	C the same stack slots as esp moves (macros defined outside this file).
    68  	movl	PARAM_DIVISOR, %eax
    69  	movl	PARAM_SIZE, %ecx
    70  
    71  	pushl	%esi		FRAME_pushl()
        	C NOTE(review): the next push is written without the l suffix used by
        	C the other pushes in this file; the 32-bit register operand already
        	C fixes the operand size.
    72  	push	%edi		FRAME_pushl()
    73  
    74  	movl	PARAM_SRC, %esi
    75  	andl	$1, %eax		C 1 if the divisor is odd, else 0
    76  
    77  	movl	PARAM_DST, %edi
    78  	addl	%ecx, %eax	C size if even, size+1 if odd
    79  
    80  	cmpl	$4, %eax
    81  	jae	L(mul_by_inverse)	C unsigned compare
    82  
    83  
        	C Plain division, high limb first.  divl divides edx:eax by the
        	C divisor, leaving the quotient in eax and the remainder in edx; that
        	C remainder is the high half of the next, lower, dividend.  The first
        	C dividend has a zero high half.
        	C The count is tested only after the decrement, so this path assumes
        	C size >= 1 on entry -- TODO confirm against the callers.
    84  	xorl	%edx, %edx
    85  L(div_top):
    86  	movl	-4(%esi,%ecx,4), %eax	C src[ecx-1]
    87  
    88  	divl	PARAM_DIVISOR
    89  
    90  	movl	%eax, -4(%edi,%ecx,4)	C dst[ecx-1] = quotient limb
    91  	decl	%ecx
    92  
    93  	jnz	L(div_top)
    94  
        	C Only esi and edi were pushed on this path.
    95  	popl	%edi
    96  	popl	%esi
    97  
    98  	ret
    99  
   100  
   101  
   102  L(mul_by_inverse):
        	C Set-up for the multiply-by-inverse loops.  Reached from the size test
        	C at function entry with esi = src, edi = dst and esi/edi already
        	C pushed.  Splits the divisor into d * 2^shift with d odd, computes inv
        	C with d*inv == 1 mod 2^GMP_LIMB_BITS, then enters the odd loop when
        	C shift == 0 or the even loop otherwise.
   103  	movl	PARAM_DIVISOR, %eax
   104  	movl	$-1, %ecx
   105  
   106  L(strip_twos):
        	C Each pass shifts one low bit of eax into the carry flag and counts it
        	C in ecx.  incl does not change the carry flag, so jnc still tests the
        	C bit shifted out by shrl.  The loop ends on the first 1 bit; ecx is
        	C then the number of low zero bits of the divisor and eax is the
        	C divisor shifted right by ecx+1.
        	C ASSERT is defined outside this file (presumably a debug-build check);
        	C it wants eax non-zero here, since a zero eax would never shift out a
        	C 1 bit and the loop would not end.
   107  	ASSERT(nz, `orl %eax, %eax')
   108  	shrl	%eax
   109  	incl	%ecx			C shift count
   110  
   111  	jnc	L(strip_twos)
   112  
   113  	leal	1(%eax,%eax), %edx	C d = 2*eax+1, the divisor without its low zero bits
   114  	andl	$127, %eax		C d/2, 7 bits, used as the table index
   115  
        	C ebx and ebp are saved only here; the divl path does not touch them.
   116  	pushl	%ebx		FRAME_pushl()
   117  	pushl	%ebp		FRAME_pushl()
   118  
        	C Fetch one byte from binvert_limb_table, indexed by eax.  The table and
        	C the LEA macro are defined outside this file; the PIC variant
        	C presumably obtains the table address in ebp without an absolute
        	C reference -- TODO confirm against the LEA definition.
   119  ifdef(`PIC',`dnl
   120  	LEA(	binvert_limb_table, %ebp)
   121  	movzbl	(%eax,%ebp), %eax		C inv 8 bits
   122  ',`
   123  	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
   124  ')
   125  
        	C Two rounds of inv = 2*inv - inv*inv*d extend the 8-bit table value to
        	C a full limb; the ASSERT further down checks the final value.  The
        	C load of PARAM_SIZE and the store to PARAM_DIVISOR are mixed in
        	C between the arithmetic, presumably for Pentium instruction pairing.
   126  	movl	%eax, %ebp		C inv
   127  	addl	%eax, %eax		C 2*inv
   128  
   129  	imull	%ebp, %ebp		C inv*inv
   130  
   131  	imull	%edx, %ebp		C inv*inv*d
   132  
   133  	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
   134  	movl	PARAM_SIZE, %ebx
   135  
   136  	movl	%eax, %ebp
   137  	addl	%eax, %eax		C 2*inv
   138  
   139  	imull	%ebp, %ebp		C inv*inv
   140  
   141  	imull	%edx, %ebp		C inv*inv*d
   142  
   143  	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
   144  	movl	%edx, PARAM_DIVISOR	C d without twos
   145  
        	C Point esi and edi just past the last limb and index with a negative
        	C count in ebx, so (%esi,%ebx,4) is src[0] now and the loops can stop
        	C when ebx reaches zero.
   146  	leal	(%esi,%ebx,4), %esi	C src end
   147  	leal	(%edi,%ebx,4), %edi	C dst end
   148  
   149  	negl	%ebx			C -size
   150  
   151  	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
   152  	pushl	%eax	FRAME_pushl()
   153  	imull	PARAM_DIVISOR, %eax
   154  	cmpl	$1, %eax
   155  	popl	%eax	FRAME_popl()')
   156  
        	C VAR_INVERSE is the PARAM_DST stack slot; dst itself is in edi by now.
   157  	movl	%eax, VAR_INVERSE
   158  	xorl	%ebp, %ebp		C initial carry bit
   159  
   160  	movl	(%esi,%ebx,4), %eax	C src low limb
   161  	orl	%ecx, %ecx		C shift, sets the zero flag when shift == 0
   162  
        	C movl leaves the flags alone, so jz below still tests the orl result.
        	C The second limb is loaded on both paths.  This code is reached only
        	C when size + (divisor & 1) >= 4, so size >= 3 and src[1] exists.
   163  	movl	4(%esi,%ebx,4), %edx	C src second limb (for even)
   164  	jz	L(odd_entry)
   165  
        	C Even divisor: eax = low limb of (src[1]:src[0]) >> shift.  shrdl( is
        	C an m4 macro call here, defined outside this file; presumably it emits
        	C the shrdl instruction with these operands.
   166  	shrdl(	%cl, %edx, %eax)
   167  
        	C The even loop addresses its limbs with a -4 displacement, so ebx runs
        	C one ahead of the limb being stored.
   168  	incl	%ebx
   169  	jmp	L(even_entry)
   170  
   171  
   172  	ALIGN(8)
   173  L(odd_top):
        	C Loop for a divisor with no low zero bits (shift == 0).  First entered
        	C at L(odd_entry) with eax = src[0], ebp = 0 and ebx = -size.  Each pass
        	C stores one quotient limb
        	C     q = (src limb - carry limb - borrow) * inverse   mod 2^32
        	C where the carry limb is the high half of the previous q times the
        	C divisor, and the borrow comes from the previous subtraction.
   174  	C eax	scratch
   175  	C ebx	counter, limbs, negative
   176  	C ecx	shift count, zero on this path and not read
   177  	C edx	scratch, high half of the mull product
   178  	C esi	src end
   179  	C edi	dst end
   180  	C ebp	carry bit, 0 or -1
   181  
   182  	mull	PARAM_DIVISOR		C edx:eax = previous q * d, only edx is used
   183  
   184  	movl	(%esi,%ebx,4), %eax	C next src limb
   185  	subl	%ebp, %edx		C ebp is 0 or -1, so this adds the borrow to the carry limb
   186  
   187  	subl	%edx, %eax		C src limb - carry limb, carry flag set on borrow
   188  
   189  	sbbl	%ebp, %ebp		C ebp = 0 or -1 from the carry flag
   190  
   191  L(odd_entry):
   192  	imull	VAR_INVERSE, %eax	C quotient limb, low 32 bits of the product
   193  
   194  	movl	%eax, (%edi,%ebx,4)
   195  
   196  	incl	%ebx
   197  	jnz	L(odd_top)		C until the negative index reaches zero
   198  
   199  
        	C Restore in the reverse order of the pushes: esi, edi, ebx, ebp.
   200  	popl	%ebp
   201  	popl	%ebx
   202  
   203  	popl	%edi
   204  	popl	%esi
   205  
   206  	ret
   207  
   208  
   209  L(even_top):
        	C Loop for a divisor with low zero bits (shift != 0).  First entered at
        	C L(even_entry) with eax = low limb of (src[1]:src[0]) >> shift, ebp = 0
        	C and ebx = 1 - size.  Same recurrence as the odd loop, except that each
        	C src limb is first formed by shifting a two-limb window right by cl.
        	C ebx runs one ahead of the limb being stored, hence the -4
        	C displacements.  The loop stops one limb short; the top limb is done
        	C by the straight-line code after it.
   210  	C eax	scratch
   211  	C ebx	counter, limbs, negative
   212  	C ecx	twos
   213  	C edx	scratch, high half of the mull product
   214  	C esi	src end
   215  	C edi	dst end
   216  	C ebp	carry bit, 0 or -1
   217  
   218  	mull	PARAM_DIVISOR		C edx:eax = previous q * d, only edx is used
   219  
   220  	subl	%ebp, %edx		C carry bit
   221  	movl	-4(%esi,%ebx,4), %eax	C src limb
   222  
        	C The borrow in ebp has been consumed above, so ebp is free to hold the
        	C upper limb of the shift window until sbbl sets it again.
   223  	movl	(%esi,%ebx,4), %ebp	C and one above it
   224  
   225  	shrdl(	%cl, %ebp, %eax)
   226  
   227  	subl	%edx, %eax		C carry limb
   228  
   229  	sbbl	%ebp, %ebp		C ebp = 0 or -1 from the carry flag
   230  
   231  L(even_entry):
   232  	imull	VAR_INVERSE, %eax	C quotient limb, low 32 bits of the product
   233  
   234  	movl	%eax, -4(%edi,%ebx,4)
   235  	incl	%ebx
   236  
   237  	jnz	L(even_top)
   238  
   239  
   240  
        	C Top limb.  There is no limb above it to shift in, so a plain shrl is
        	C used instead of shrdl, and the borrow from the last subtraction is
        	C not kept.
   241  	mull	PARAM_DIVISOR
   242  
   243  	movl	-4(%esi), %eax		C src high limb
   244  	subl	%ebp, %edx
   245  
   246  	shrl	%cl, %eax
   247  
   248  	subl	%edx, %eax		C no carry if division is exact
   249  
   250  	imull	VAR_INVERSE, %eax
   251  
   252  	movl	%eax, -4(%edi)		C dst high limb
   253  	nop				C protect against cache bank clash
   254  
        	C Restore in the reverse order of the pushes: esi, edi, ebx, ebp.
   255  	popl	%ebp
   256  	popl	%ebx
   257  
   258  	popl	%edi
   259  	popl	%esi
   260  
   261  	ret
   262  
   263  EPILOGUE()
   264  ASM_END()