github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/bdiv_q_1.asm (about)

     1  dnl  Intel Pentium mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- mpn by limb exact division.
     2  
     3  dnl  Rearranged from mpn/x86/pentium/dive_1.asm by Marco Bodrato.
     4  
     5  dnl  Copyright 2001, 2002, 2011, 2014 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  
    36  C         divisor
    37  C       odd   even
    38  C P54:  24.5  30.5   cycles/limb
    39  C P55:  23.0  28.0
    40  
    41  MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
    42  
    43  C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
    44  C expected.  On P54 in the even case the shrdl pairing nonsense (see
    45  C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
    46  C further 1.5 slowdown for both odd and even.
    47  
    48  defframe(PARAM_SHIFT,  24)	C int shift, passed to mpn_pi1_bdiv_q_1 only
    49  defframe(PARAM_INVERSE,20)	C mp_limb_t inverse, passed to mpn_pi1_bdiv_q_1 only
    50  defframe(PARAM_DIVISOR,16)	C mp_limb_t divisor; mpn_bdiv_q_1 stores d without twos back here
    51  defframe(PARAM_SIZE,   12)	C mp_size_t size, counted in limbs
    52  defframe(PARAM_SRC,    8)	C mp_srcptr src
    53  defframe(PARAM_DST,    4)	C mp_ptr dst
    54  
    55  dnl  re-use parameter space
    56  define(VAR_INVERSE,`PARAM_DST')	C the dst slot holds the inverse once dst has been loaded into edi
    57  
    58  	TEXT
    59  
    60  	ALIGN(32)
    61  C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
    62  C                           mp_limb_t divisor);
    63  C Strips the twos from divisor, computes its inverse, then joins mpn_pi1_bdiv_q_1 at L(common).
    64  PROLOGUE(mpn_bdiv_q_1)
    65  deflit(`FRAME',0)
    66  
    67  	movl	$-1, %ecx		C shift count starts at -1, the first incl makes it 0
    68  	movl	PARAM_DIVISOR, %eax	C divisor as passed in
    69  
    70  L(strip_twos):
    71  	ASSERT(nz, `orl %eax, %eax')	C expect eax nonzero before each shift
    72  	shrl	%eax			C low bit goes into the carry flag
    73  	incl	%ecx			C shift count
    74  
    75  	jnc	L(strip_twos)		C loop until a 1 bit is shifted out; incl leaves the carry flag alone
    76  
    77  	leal	1(%eax,%eax), %edx	C d = 2*(d/2)+1, the divisor with its twos stripped
    78  	andl	$127, %eax		C d/2, 7 bits
    79  
    80  	pushl	%ebx		FRAME_pushl()	C popped again before each ret after L(common)
    81  	pushl	%ebp		FRAME_pushl()	C popped again before each ret after L(common)
    82  
    83  ifdef(`PIC',`
    84  ifdef(`DARWIN',`
    85  	LEA(	binvert_limb_table, %ebp)	C table address into ebp
    86  	movzbl	(%eax,%ebp), %eax	C inv 8 bits
    87  ',`
    88  	call	L(here)
    89  L(here):
    90  	popl	%ebp			C eip
    91  
    92  	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp	C ebp = GOT address
    93  	C AGI
    94  	movl	binvert_limb_table@GOT(%ebp), %ebp	C table address from its GOT entry
    95  	C AGI
    96  	movzbl	(%eax,%ebp), %eax	C inv 8 bits
    97  ')
    98  ',`
    99  
   100  dnl non-PIC
   101  	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
   102  ')
   103  
   104  	movl	%eax, %ebp		C inv
   105  	addl	%eax, %eax		C 2*inv
   106  
   107  	imull	%ebp, %ebp		C inv*inv
   108  
   109  	imull	%edx, %ebp		C inv*inv*d
   110  
   111  	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
   112  	movl	PARAM_SIZE, %ebx	C size, expected in ebx at L(common)
   113  
   114  	movl	%eax, %ebp		C second pass of the same refinement step
   115  	addl	%eax, %eax		C 2*inv
   116  
   117  	imull	%ebp, %ebp		C inv*inv
   118  
   119  	imull	%edx, %ebp		C inv*inv*d
   120  
   121  	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
   122  	movl	%edx, PARAM_DIVISOR	C d without twos, read back by the mull instructions after L(common)
   123  
   124  	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
   125  	pushl	%eax	FRAME_pushl()
   126  	imull	PARAM_DIVISOR, %eax
   127  	cmpl	$1, %eax
   128  	popl	%eax	FRAME_popl()')
   129  
   130  	jmp	L(common)		C continue in mpn_pi1_bdiv_q_1 with eax=inverse, ebx=size, ecx=shift
   131  EPILOGUE()
   132  
   133  C mp_limb_t
   134  C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
   135  C		    mp_limb_t inverse, int shift)
   136  	ALIGN(32)
   137  PROLOGUE(mpn_pi1_bdiv_q_1)
   138  deflit(`FRAME',0)
   139  
   140  	movl	PARAM_SHIFT, %ecx	C shift count used by the shrdl and shrl below
   141  
   142  	pushl	%ebx		FRAME_pushl()	C popped again before each ret
   143  	pushl	%ebp		FRAME_pushl()	C popped again before each ret
   144  
   145  	movl	PARAM_SIZE, %ebx	C size in limbs
   146  	movl	PARAM_INVERSE, %eax	C inverse, stored to VAR_INVERSE below
   147  
   148  L(common):	C also entered from mpn_bdiv_q_1: eax=inverse, ebx=size, ecx=shift, ebx and ebp pushed
   149  	pushl	%esi		FRAME_pushl()
   150  	push	%edi		FRAME_pushl()
   151  
   152  	movl	PARAM_SRC, %esi		C src
   153  	movl	PARAM_DST, %edi		C dst, read before its slot is overwritten
   154  	movl	%eax, VAR_INVERSE	C inverse now lives in the dst parameter slot
   155  
   156  	leal	(%esi,%ebx,4), %esi	C src end
   157  	leal	(%edi,%ebx,4), %edi	C dst end
   158  
   159  	negl	%ebx			C -size; NOTE(review): everything below assumes size >= 1 - confirm with callers
   160  
   161  	xorl	%ebp, %ebp		C initial carry bit
   162  
   163  	orl	%ecx, %ecx		C shift, sets the zero flag when shift is 0
   164  	movl	(%esi,%ebx,4), %eax	C src low limb; movl leaves the flags from orl intact
   165  	jz	L(odd_entry)		C shift == 0, use the loop without shrdl
   166  
   167  	xorl	%edx, %edx		C initial carry limb (for even, if one)
   168  	incl	%ebx			C counter runs one ahead on the even path, hence the -4 displacements
   169  	jz	L(one)			C size == 1, only the single limb to do
   170  
   171  	movl	(%esi,%ebx,4), %edx	C src second limb (for even)
   172  	shrdl(	%cl, %edx, %eax)	C eax = low limb shifted right, top bits filled from the second limb
   173  
   174  	jmp	L(even_entry)
   175  
   176  
   177  	ALIGN(8)
   178  L(odd_top):
   179  	C eax	scratch
   180  	C ebx	counter, limbs, negative
   181  	C ecx	zero on this path, not used in the loop
   182  	C edx	scratch, written by mull
   183  	C esi	src end
   184  	C edi	dst end
   185  	C ebp	carry bit, 0 or -1
   186  
   187  	mull	PARAM_DIVISOR		C edx:eax = quotient limb just stored * d
   188  
   189  	movl	(%esi,%ebx,4), %eax	C next src limb
   190  	subl	%ebp, %edx		C carry limb = high half plus the carry bit
   191  
   192  	subl	%edx, %eax		C src limb - carry limb
   193  
   194  	sbbl	%ebp, %ebp		C new carry bit, -1 if that subtract borrowed
   195  
   196  L(odd_entry):
   197  	imull	VAR_INVERSE, %eax	C quotient limb = eax * inverse, low 32 bits
   198  
   199  	movl	%eax, (%edi,%ebx,4)	C store quotient limb
   200  
   201  	incl	%ebx
   202  	jnz	L(odd_top)		C until the counter reaches zero
   203  
   204  	popl	%edi
   205  	popl	%esi
   206  
   207  	popl	%ebp
   208  	popl	%ebx
   209  
   210  	ret				C NOTE(review): eax still holds the last quotient limb, no other return value is set - confirm
   211  
   212  L(even_top):
   213  	C eax	scratch
   214  	C ebx	counter, limbs, negative
   215  	C ecx	twos
   216  	C edx	scratch, written by mull
   217  	C esi	src end
   218  	C edi	dst end
   219  	C ebp	carry bit, 0 or -1
   220  
   221  	mull	PARAM_DIVISOR		C edx:eax = quotient limb just stored * d
   222  
   223  	subl	%ebp, %edx		C carry bit
   224  	movl	-4(%esi,%ebx,4), %eax	C src limb
   225  
   226  	movl	(%esi,%ebx,4), %ebp	C and one above it
   227  
   228  	shrdl(	%cl, %ebp, %eax)	C eax = src limb shifted right, top bits filled from the limb above
   229  
   230  	subl	%edx, %eax		C carry limb
   231  
   232  	sbbl	%ebp, %ebp		C new carry bit, -1 if that subtract borrowed
   233  
   234  L(even_entry):
   235  	imull	VAR_INVERSE, %eax	C quotient limb = eax * inverse, low 32 bits
   236  
   237  	movl	%eax, -4(%edi,%ebx,4)	C store quotient limb
   238  	incl	%ebx
   239  
   240  	jnz	L(even_top)		C until only the high limb is left
   241  
   242  	mull	PARAM_DIVISOR		C edx:eax = quotient limb just stored * d
   243  
   244  	movl	-4(%esi), %eax		C src high limb
   245  	subl	%ebp, %edx		C carry limb = high half plus the carry bit
   246  
   247  L(one):
   248  	shrl	%cl, %eax		C high limb shifted right, zeros come in at the top
   249  
   250  	subl	%edx, %eax		C no carry if division is exact
   251  
   252  	imull	VAR_INVERSE, %eax	C last quotient limb
   253  
   254  	movl	%eax, -4(%edi)		C dst high limb
   255  	nop				C protect against cache bank clash
   256  
   257  	popl	%edi
   258  	popl	%esi
   259  
   260  	popl	%ebp
   261  	popl	%ebx
   262  
   263  	ret				C NOTE(review): eax still holds the last quotient limb, no other return value is set - confirm
   264  
   265  EPILOGUE()
   266  ASM_END()