github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/divrem_1.asm (about) 1 dnl Intel Pentium-4 mpn_divrem_1 -- mpn by limb division. 2 3 dnl Copyright 1999-2004 Free Software Foundation, Inc. 4 5 dnl This file is part of the GNU MP Library. 6 dnl 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 dnl it under the terms of either: 9 dnl 10 dnl * the GNU Lesser General Public License as published by the Free 11 dnl Software Foundation; either version 3 of the License, or (at your 12 dnl option) any later version. 13 dnl 14 dnl or 15 dnl 16 dnl * the GNU General Public License as published by the Free Software 17 dnl Foundation; either version 2 of the License, or (at your option) any 18 dnl later version. 19 dnl 20 dnl or both in parallel, as here. 21 dnl 22 dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 dnl for more details. 26 dnl 27 dnl You should have received copies of the GNU General Public License and the 28 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 dnl see https://www.gnu.org/licenses/. 30 31 include(`../config.m4') 32 33 34 C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part. 
35 36 37 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, 38 C mp_srcptr src, mp_size_t size, 39 C mp_limb_t divisor); 40 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, 41 C mp_srcptr src, mp_size_t size, 42 C mp_limb_t divisor, mp_limb_t carry); 43 C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, 44 C mp_srcptr src, mp_size_t size, 45 C mp_limb_t divisor, mp_limb_t inverse, 46 C unsigned shift); 47 C 48 C Algorithm: 49 C 50 C The method and nomenclature follow part 8 of "Division by Invariant 51 C Integers using Multiplication" by Granlund and Montgomery, reference in 52 C gmp.texi. 53 C 54 C "m" is written for what is m' in the paper, and "d" for d_norm, which 55 C won't cause any confusion since it's only the normalized divisor that's of 56 C any use in the code. "b" is written for 2^N, the size of a limb, N being 57 C 32 here. 58 C 59 C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as 60 C "n-d - q1*d". This rearrangement gives the same two-limb answer but lets 61 C us have just a psubq on the dependent chain. 62 C 63 C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here, 64 C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much. 65 C 66 C Notes: 67 C 68 C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high 69 C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero 70 C carry, since in normal circumstances that will be a very rare event. 71 C 72 C The test for skipping a division is branch free (once size>=1 is tested). 73 C The store to the destination high limb is 0 when a divide is skipped, or 74 C if it's not skipped then a copy of the src high limb is stored. The 75 C latter is in case src==dst. 76 C 77 C There's a small bias towards expecting xsize==0, by having code for 78 C xsize==0 in a straight line and xsize!=0 under forward jumps. 
C
C Enhancements:
C
C The loop measures 32 cycles, but the dependent chain would suggest it
C could be done with 30.  Not sure where to start looking for the extras.
C
C Alternatives:
C
C If the divisor is normalized (high bit set) then a division step can
C always be skipped, since the high destination limb is always 0 or 1 in
C that case.  It doesn't seem worth checking for this though, since it
C probably occurs infrequently.


dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
dnl
dnl  The inverse takes about 80-90 cycles to calculate, but after that the
dnl  multiply is 32 c/l versus division at about 58 c/l.
dnl
dnl  At 4 limbs the div is a touch faster than the mul (and of course
dnl  simpler), so start the mul from 5 limbs.

deflit(MUL_THRESHOLD, 5)


dnl  Stack frame offsets of the incoming parameters (cdecl, 4-byte slots).
dnl  PARAM_CARRY and PARAM_PREINV_INVERSE share offset 24: the three entry
dnl  points have different argument lists past the common first five.

defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
defframe(PARAM_CARRY,  24)  dnl mpn_divrem_1c
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,   16)
defframe(PARAM_SRC,    12)
defframe(PARAM_XSIZE,  8)
defframe(PARAM_DST,    4)

dnl  re-use parameter space: callee-saved registers are stashed in the
dnl  caller's argument slots once those arguments have been loaded.
define(SAVE_ESI,`PARAM_SIZE')
define(SAVE_EBP,`PARAM_SRC')
define(SAVE_EDI,`PARAM_DIVISOR')
define(SAVE_EBX,`PARAM_DST')

	TEXT

C mpn_preinv_divrem_1: caller supplies a precomputed inverse of the
C normalized divisor and the normalization shift, so the setup in
C L(mul_by_inverse) is skipped and we jump straight to L(start_preinv).

	ALIGN(16)
PROLOGUE(mpn_preinv_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	xorl	%edx, %edx		C carry if can't skip a div

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	-4(%esi,%ecx,4), %eax	C src high limb

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	movd	PARAM_PREINV_INVERSE, %mm4

	movd	PARAM_PREINV_SHIFT, %mm7	C l
	cmpl	%ebp, %eax		C high cmp divisor

	C Branch-free skip of one division: if src high < divisor then the
	C high quotient limb is 0 and src high becomes the initial carry.
	cmovc(	%eax, %edx)		C high is carry if high<divisor
	movd	%edx, %mm0		C carry

	movd	%edx, %mm1		C carry
	movl	$0, %edx

	movd	%ebp, %mm5		C d
	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
					C (the latter in case src==dst)
	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]

	movl	%edx, (%edi,%ecx,4)	C dst high limb
	sbbl	$0, %ecx		C skip one division if high<divisor
	movl	$32, %eax

	subl	PARAM_PREINV_SHIFT, %eax
	psllq	%mm7, %mm5		C d normalized
	leal	(%edi,%ecx,4), %edi	C &dst[xsize+size-1]
	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]

	movd	%eax, %mm6		C 32-l
	jmp	L(start_preinv)

EPILOGUE()


C mpn_divrem_1c: same as mpn_divrem_1 but with an explicit initial carry.
C No division can be skipped here since the carry is assumed non-zero in
C normal use (see notes above).

	ALIGN(16)
PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)

	movl	PARAM_CARRY, %edx

	movl	PARAM_SIZE, %ecx

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
	jmp	L(start_1c)

EPILOGUE()


C mpn_divrem_1: divide {src,size} limbs plus xsize implicit zero fraction
C limbs by divisor, storing quotient limbs at dst and returning the
C remainder in eax.

	ALIGN(16)
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	xorl	%edx, %edx		C initial carry (if can't skip a div)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx
	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]

	orl	%ecx, %ecx		C size
	jz	L(no_skip_div)		C if size==0
	movl	-4(%esi,%ecx,4), %eax	C src high limb

	cmpl	%ebp, %eax		C high cmp divisor

	C Branch-free high-limb handling, as in mpn_preinv_divrem_1 above:
	C store 0 (division skipped) or a copy of the src high limb.
	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
	movl	%edx, (%edi,%ecx,4)	C dst high limb

	movl	$0, %edx
	cmovc(	%eax, %edx)		C high is carry if high<divisor

	sbbl	$0, %ecx		C size-1 if high<divisor
L(no_skip_div):


L(start_1c):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	leal	(%ebx,%ecx), %eax	C size+xsize
	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]
	leal	(%edi,%ecx,4), %edi	C &dst[size+xsize-1]

	cmpl	$MUL_THRESHOLD, %eax
	jae	L(mul_by_inverse)

	C Small operand: plain divl loops are faster than computing the
	C inverse (see MUL_THRESHOLD notes above).

	orl	%ecx, %ecx
	jz	L(divide_no_integer)	C if size==0

L(divide_integer):
	C eax	scratch (quotient)
	C ebx	xsize
	C ecx	counter
	C edx	carry
	C esi	src, decrementing
	C edi	dst, decrementing
	C ebp	divisor

	movl	(%esi), %eax
	subl	$4, %esi

	divl	%ebp			C edx:eax / divisor, remainder left in edx

	movl	%eax, (%edi)
	subl	$4, %edi

	subl	$1, %ecx
	jnz	L(divide_integer)


L(divide_no_integer):
	orl	%ebx, %ebx
	jnz	L(divide_fraction)	C if xsize!=0

L(divide_done):
	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp
	movl	%edx, %eax		C return remainder
	ret


L(divide_fraction):
	C eax	scratch (quotient)
	C ebx	counter
	C ecx
	C edx	carry
	C esi
	C edi	dst, decrementing
	C ebp	divisor

	movl	$0, %eax		C fraction limbs are implicit zeros

	divl	%ebp

	movl	%eax, (%edi)
	subl	$4, %edi

	subl	$1, %ebx
	jnz	L(divide_fraction)

	jmp	L(divide_done)



C -----------------------------------------------------------------------------

L(mul_by_inverse):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	&src[size-1]
	C edi	&dst[size+xsize-1]
	C ebp	divisor

	C Compute l = count of leading zeros on d, the normalized divisor
	C d = divisor<<l, and the inverse m = floor((b*(b-d)-1)/d) of the
	C Granlund/Montgomery method (see the algorithm notes above).

	bsrl	%ebp, %eax		C 31-l
	movd	%edx, %mm0		C carry
	movd	%edx, %mm1		C carry
	movl	%ecx, %edx		C size
	movl	$31, %ecx

	C

	xorl	%eax, %ecx		C l = leading zeros on d
	addl	$1, %eax

	shll	%cl, %ebp		C d normalized
	movd	%ecx, %mm7		C l
	movl	%edx, %ecx		C size

	movd	%eax, %mm6		C 32-l
	movl	$-1, %edx
	movl	$-1, %eax

	C

	subl	%ebp, %edx		C (b-d)-1 so edx:eax = b*(b-d)-1

	divl	%ebp			C floor (b*(b-d)-1 / d)
	movd	%ebp, %mm5		C d

	C
	movd	%eax, %mm4		C m


L(start_preinv):
	C eax	inverse
	C ebx	xsize
	C ecx	size
	C edx
	C esi	&src[size-1]
	C edi	&dst[size+xsize-1]
	C ebp
	C
	C mm0	carry
	C mm1	carry
	C mm2
	C mm4	m
	C mm5	d
	C mm6	31-l
	C mm7	l

	psllq	%mm7, %mm0		C n2 = carry << l, for size==0

	subl	$1, %ecx
	jb	L(integer_none)

	movd	(%esi), %mm0		C src high limb
	punpckldq %mm1, %mm0
	psrlq	%mm6, %mm0		C n2 = high (carry:srchigh << l)
	jz	L(integer_last)


C The dependent chain here consists of
C
C	2   paddd    n1+n2
C	8   pmuludq  m*(n1+n2)
C	2   paddq    n2:nadj + m*(n1+n2)
C	2   psrlq    q1
C	8   pmuludq  d*q1
C	2   psubq    (n-d)-q1*d
C	2   psrlq    high n-(q1+1)*d mask
C	2   pand     d masked
C	2   paddd    n2+d addback
C	--
C	30
C
C But it seems to run at 32 cycles, so presumably there's something else
C going on.

	ALIGN(16)
L(integer_top):
	C eax
	C ebx
	C ecx	counter, size-1 to 0
	C edx
	C esi	src, decrementing
	C edi	dst, decrementing
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l

	ASSERT(b,`C n2<d
	 movd	%mm0, %eax
	 movd	%mm5, %edx
	 cmpl	%edx, %eax')

	C Two partial src limbs combined and shifted give the next
	C normalized limb n10 = low 32 bits of (src[i]:src[i-1] << l).
	movd	-4(%esi), %mm1		C next src limbs
	movd	(%esi), %mm2
	leal	-4(%esi), %esi

	punpckldq %mm2, %mm1
	psrlq	%mm6, %mm1		C n10

	movq	%mm1, %mm2		C n10
	movq	%mm1, %mm3		C n10
	psrad	$31, %mm1		C -n1
	pand	%mm5, %mm1		C -n1 & d
	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow

	psrld	$31, %mm2		C n1
	paddd	%mm0, %mm2		C n2+n1
	punpckldq %mm0, %mm1		C n2:nadj

	pmuludq	%mm4, %mm2		C m*(n2+n1)

	C

	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
	pxor	%mm2, %mm2		C break dependency, saves 4 cycles
	pcmpeqd	%mm2, %mm2		C FF...FF
	psrlq	$63, %mm2		C 1

	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))

	paddd	%mm1, %mm2		C q1+1
	pmuludq	%mm5, %mm1		C q1*d

	punpckldq %mm0, %mm3		C n = n2:n10
	pxor	%mm0, %mm0

	psubq	%mm5, %mm3		C n - d

	C

	C "n-d - q1*d" form: same two-limb result as n - (q1+1)*d but
	C keeps just one psubq on the dependent chain (see notes above).
	psubq	%mm1, %mm3		C n - (q1+1)*d

	por	%mm3, %mm0		C copy remainder -> new n2
	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	 movd	%mm3, %eax
	 addl	$1, %eax
	 cmpl	$1, %eax')

	paddd	%mm3, %mm2		C q
	pand	%mm5, %mm3		C mask & d

	paddd	%mm3, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi

	subl	$1, %ecx
	ja	L(integer_top)


L(integer_last):
	C Final integer-part step: only src[0] remains, shifted left by l
	C with zeros shifted in (no lower limb to supply bits).
	C
	C eax
	C ebx	xsize
	C ecx
	C edx
	C esi	&src[0]
	C edi	&dst[xsize]
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6
	C mm7	l

	ASSERT(b,`C n2<d
	 movd	%mm0, %eax
	 movd	%mm5, %edx
	 cmpl	%edx, %eax')

	movd	(%esi), %mm1		C src[0]
	psllq	%mm7, %mm1		C n10

	movq	%mm1, %mm2		C n10
	movq	%mm1, %mm3		C n10
	psrad	$31, %mm1		C -n1
	pand	%mm5, %mm1		C -n1 & d
	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow

	psrld	$31, %mm2		C n1
	paddd	%mm0, %mm2		C n2+n1
	punpckldq %mm0, %mm1		C n2:nadj

	pmuludq	%mm4, %mm2		C m*(n2+n1)

	C

	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
	pcmpeqd	%mm2, %mm2		C FF...FF
	psrlq	$63, %mm2		C 1

	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))
	paddd	%mm1, %mm2		C q1+1

	pmuludq	%mm5, %mm1		C q1*d
	punpckldq %mm0, %mm3		C n
	psubq	%mm5, %mm3		C n - d
	pxor	%mm0, %mm0

	C

	psubq	%mm1, %mm3		C n - (q1+1)*d

	por	%mm3, %mm0		C remainder -> n2
	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	 movd	%mm3, %eax
	 addl	$1, %eax
	 cmpl	$1, %eax')

	paddd	%mm3, %mm2		C q
	pand	%mm5, %mm3		C mask & d

	paddd	%mm3, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi


L(integer_none):
	C eax
	C ebx	xsize

	orl	%ebx, %ebx
	jnz	L(fraction_some)	C if xsize!=0


L(fraction_done):
	C Un-normalize the remainder (shift right by l) and restore the
	C callee-saved registers from the parameter slots.
	movl	SAVE_EBP, %ebp
	psrld	%mm7, %mm0		C remainder

	movl	SAVE_EDI, %edi
	movd	%mm0, %eax

	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx
	emms
	ret



C -----------------------------------------------------------------------------
C

L(fraction_some):
	C Fraction limbs are implicit zeros, so n10=0 each step and the
	C quotient step simplifies: q1 = high(n2:0 + m*n2).
	C
	C eax
	C ebx	xsize
	C ecx
	C edx
	C esi
	C edi	&dst[xsize-1]
	C ebp


L(fraction_top):
	C eax
	C ebx	counter, xsize iterations
	C ecx
	C edx
	C esi	src, decrementing
	C edi	dst, decrementing
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l

	ASSERT(b,`C n2<d
	 movd	%mm0, %eax
	 movd	%mm5, %edx
	 cmpl	%edx, %eax')

	movq	%mm0, %mm1		C n2
	pmuludq	%mm4, %mm0		C m*n2

	pcmpeqd	%mm2, %mm2
	psrlq	$63, %mm2		C 1

	C

	psrlq	$32, %mm0		C high(m*n2)

	paddd	%mm1, %mm0		C q1 = high(n2:0 + m*n2)

	paddd	%mm0, %mm2		C q1+1
	pmuludq	%mm5, %mm0		C q1*d

	psllq	$32, %mm1		C n = n2:0
	psubq	%mm5, %mm1		C n - d

	C

	psubq	%mm0, %mm1		C r = n - (q1+1)*d
	pxor	%mm0, %mm0

	por	%mm1, %mm0		C r -> n2
	psrlq	$32, %mm1		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	 movd	%mm1, %eax
	 addl	$1, %eax
	 cmpl	$1, %eax')

	paddd	%mm1, %mm2		C q
	pand	%mm5, %mm1		C mask & d

	paddd	%mm1, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi

	subl	$1, %ebx
	jne	L(fraction_top)


	jmp	L(fraction_done)

EPILOGUE()