github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/mmx/divrem_1.asm (about)

     1  dnl  AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb
     2  dnl  division.
     3  
     4  dnl  Copyright 1999-2002, 2004 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  
    35  C K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
    36  
    37  
    38  C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
    39  C                         mp_srcptr src, mp_size_t size,
    40  C                         mp_limb_t divisor);
    41  C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
    42  C                          mp_srcptr src, mp_size_t size,
    43  C                          mp_limb_t divisor, mp_limb_t carry);
    44  C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
    45  C                                mp_srcptr src, mp_size_t size,
    46  C                                mp_limb_t divisor, mp_limb_t inverse,
    47  C                                unsigned shift);
    48  C
    49  C Algorithm:
    50  C
    51  C The method and nomenclature follow part 8 of "Division by Invariant
    52  C Integers using Multiplication" by Granlund and Montgomery, reference in
    53  C gmp.texi.
    54  C
    55  C The "and"s shown in the paper are done here with "cmov"s.  "m" is written
    56  C for m', and "d" for d_norm, which won't cause any confusion since it's
    57  C only the normalized divisor that's of any use in the code.  "b" is written
    58  C for 2^N, the size of a limb, N being 32 here.
    59  C
    60  C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
    61  C "n-(q1+1)*d"; this rearrangement gives the same two-limb answer.  If
    62  C q1==0xFFFFFFFF, then q1+1 would overflow.  We branch to a special case
    63  C "q1_ff" if this occurs.  Since the true quotient is either q1 or q1+1 then
    64  C if q1==0xFFFFFFFF that must be the right value.
    65  C
    66  C For the last and second last steps q1==0xFFFFFFFF is instead handled by an
    67  C sbbl to go back to 0xFFFFFFFF if an overflow occurs when adding 1.  This
    68  C then goes through as normal, and finding no addback required.  sbbl costs
    69  C an extra cycle over what the main loop code does, but it keeps code size
    70  C and complexity down.
    71  C
    72  C Notes:
    73  C
    74  C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
    75  C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero
    76  C carry, since in normal circumstances that will be a very rare event.
    77  C
    78  C The test for skipping a division is branch free (once size>=1 is tested).
    79  C The store to the destination high limb is 0 when a divide is skipped, or
    80  C if it's not skipped then a copy of the src high limb is used.  The latter
    81  C is in case src==dst.
    82  C
    83  C There's a small bias towards expecting xsize==0, by having code for
    84  C xsize==0 in a straight line and xsize!=0 under forward jumps.
    85  C
    86  C Alternatives:
    87  C
    88  C If the divisor is normalized (high bit set) then a division step can
    89  C always be skipped, since the high destination limb is always 0 or 1 in
    90  C that case.  It doesn't seem worth checking for this though, since it
    91  C probably occurs infrequently, in particular note that big_base for a
    92  C decimal mpn_get_str is not normalized in a 32-bit limb.
    93  
    94  
    95  dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
    96  dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
    97  dnl
    98  dnl  The inverse takes about 50 cycles to calculate, but after that the
    99  dnl  multiply is 17 c/l versus division at 42 c/l.
   100  dnl
   101  dnl  At 3 limbs the mul is a touch faster than div on the integer part, and
   102  dnl  even more so on the fractional part.
   103  
   104  deflit(MUL_THRESHOLD, 3)
   105  
   106  
   107  defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
   108  defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
   109  defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
   110  defframe(PARAM_DIVISOR,20)
   111  defframe(PARAM_SIZE,   16)
   112  defframe(PARAM_SRC,    12)
   113  defframe(PARAM_XSIZE,  8)
   114  defframe(PARAM_DST,    4)
   115  
   116  defframe(SAVE_EBX,    -4)
   117  defframe(SAVE_ESI,    -8)
   118  defframe(SAVE_EDI,    -12)
   119  defframe(SAVE_EBP,    -16)
   120  
   121  defframe(VAR_NORM,    -20)
   122  defframe(VAR_INVERSE, -24)
   123  defframe(VAR_SRC,     -28)
   124  defframe(VAR_DST,     -32)
   125  defframe(VAR_DST_STOP,-36)
   126  
   127  deflit(STACK_SPACE, 36)
   128  
   129  	TEXT
   130  	ALIGN(32)
   131  
   132  PROLOGUE(mpn_preinv_divrem_1)
   133  deflit(`FRAME',0)
	C Preinverse entry point: the caller supplies the divisor inverse
	C (PARAM_PREINV_INVERSE) and normalization shift (PARAM_PREINV_SHIFT),
	C so the bsrl/divl inverse setup done in mpn_divrem_1 is skipped and
	C control joins the common code at L(start_preinv).
   134  	movl	PARAM_XSIZE, %ecx
   135  	movl	PARAM_DST, %edx
   136  	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)

   138  	movl	%esi, SAVE_ESI
   139  	movl	PARAM_SRC, %esi

   141  	movl	%ebx, SAVE_EBX
   142  	movl	PARAM_SIZE, %ebx

   144  	leal	8(%edx,%ecx,4), %edx	C &dst[xsize+2]
   145  	movl	%ebp, SAVE_EBP
   146  	movl	PARAM_DIVISOR, %ebp

   148  	movl	%edx, VAR_DST_STOP	C &dst[xsize+2]
   149  	movl	%edi, SAVE_EDI
   150  	xorl	%edi, %edi		C carry

   152  	movl	-4(%esi,%ebx,4), %eax	C src high limb
   153  	xor	%ecx, %ecx

   155  	C

   157  	C

   159  	cmpl	%ebp, %eax		C high cmp divisor

   161  	cmovc(	%eax, %edi)		C high is carry if high<divisor
   162  	cmovnc(	%eax, %ecx)		C 0 if skip div, src high if not
   163  					C (the latter in case src==dst)

   165  	movl	%ecx, -12(%edx,%ebx,4)	C dst high limb
   166  	sbbl	$0, %ebx		C skip one division if high<divisor
   167  	movl	PARAM_PREINV_SHIFT, %ecx

   169  	leal	-8(%edx,%ebx,4), %edx	C &dst[xsize+size]
   170  	movl	$32, %eax

   172  	movl	%edx, VAR_DST		C &dst[xsize+size]

   174  	shll	%cl, %ebp		C d normalized
   175  	subl	%ecx, %eax
   176  	movl	%ecx, VAR_NORM

   178  	movd	%eax, %mm7		C rshift
   179  	movl	PARAM_PREINV_INVERSE, %eax
   180  	jmp	L(start_preinv)

   182  EPILOGUE()
   183  
   184  
   185  	ALIGN(16)
   186  
   187  PROLOGUE(mpn_divrem_1c)
   188  deflit(`FRAME',0)
	C Carry-in entry point: edx starts as the caller's carry limb rather
	C than 0, then control joins the common dispatch at L(start_1c).
	C No check is made for carry==0 (see the notes at the top of the file).
   189  	movl	PARAM_CARRY, %edx
   190  	movl	PARAM_SIZE, %ecx
   191  	subl	$STACK_SPACE, %esp
   192  deflit(`FRAME',STACK_SPACE)

   194  	movl	%ebx, SAVE_EBX
   195  	movl	PARAM_XSIZE, %ebx

   197  	movl	%edi, SAVE_EDI
   198  	movl	PARAM_DST, %edi

   200  	movl	%ebp, SAVE_EBP
   201  	movl	PARAM_DIVISOR, %ebp

   203  	movl	%esi, SAVE_ESI
   204  	movl	PARAM_SRC, %esi

   206  	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
   207  	jmp	L(start_1c)

   209  EPILOGUE()
   210  
   211  
   212  	C offset 0xa1, close enough to aligned
   213  PROLOGUE(mpn_divrem_1)
   214  deflit(`FRAME',0)
	C Plain entry point: the initial carry is 0, and one divl is skipped
	C when the src high limb is below the divisor (the cmov/sbbl sequence
	C below).  Falls through to the common dispatch at L(start_1c).

   216  	movl	PARAM_SIZE, %ecx
   217  	movl	$0, %edx		C initial carry (if can't skip a div)
   218  	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

   221  	movl	%esi, SAVE_ESI
   222  	movl	PARAM_SRC, %esi

   224  	movl	%ebx, SAVE_EBX
   225  	movl	PARAM_XSIZE, %ebx

   227  	movl	%ebp, SAVE_EBP
   228  	movl	PARAM_DIVISOR, %ebp
   229  	orl	%ecx, %ecx		C size

   231  	movl	%edi, SAVE_EDI
   232  	movl	PARAM_DST, %edi
   233  	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]

   235  	jz	L(no_skip_div)		C if size==0
   236  	movl	-4(%esi,%ecx,4), %eax	C src high limb
   237  	xorl	%esi, %esi

   239  	cmpl	%ebp, %eax		C high cmp divisor

   241  	cmovc(	%eax, %edx)		C high is carry if high<divisor
   242  	cmovnc(	%eax, %esi)		C 0 if skip div, src high if not

   244  	movl	%esi, (%edi,%ecx,4)	C dst high limb
   245  	sbbl	$0, %ecx		C size-1 if high<divisor
   246  	movl	PARAM_SRC, %esi		C reload
   247  L(no_skip_div):


   250  L(start_1c):
   251  	C eax
   252  	C ebx	xsize
   253  	C ecx	size
   254  	C edx	carry
   255  	C esi	src
   256  	C edi	&dst[xsize-1]
   257  	C ebp	divisor

   259  	leal	(%ebx,%ecx), %eax	C size+xsize
   260  	cmpl	$MUL_THRESHOLD, %eax
   261  	jae	L(mul_by_inverse)


   264  C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
   265  C It'd be possible to write them out without the looping, but no speedup
   266  C would be expected.
   267  C
   268  C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
   269  C integer part, but curiously not on the fractional part, where %ebp is a
   270  C (fixed) couple of cycles faster.

   272  	orl	%ecx, %ecx
   273  	jz	L(divide_no_integer)

   275  L(divide_integer):
   276  	C eax	scratch (quotient)
   277  	C ebx	xsize
   278  	C ecx	counter
   279  	C edx	scratch (remainder)
   280  	C esi	src
   281  	C edi	&dst[xsize-1]
   282  	C ebp	divisor

   284  	movl	-4(%esi,%ecx,4), %eax

   286  	divl	PARAM_DIVISOR

   288  	movl	%eax, (%edi,%ecx,4)
   289  	decl	%ecx
   290  	jnz	L(divide_integer)


   293  L(divide_no_integer):
   294  	movl	PARAM_DST, %edi
   295  	orl	%ebx, %ebx
   296  	jnz	L(divide_fraction)

   298  L(divide_done):
   299  	movl	SAVE_ESI, %esi
   300  	movl	SAVE_EDI, %edi
   301  	movl	%edx, %eax

   303  	movl	SAVE_EBX, %ebx
   304  	movl	SAVE_EBP, %ebp
   305  	addl	$STACK_SPACE, %esp

   307  	ret


   310  L(divide_fraction):
   311  	C eax	scratch (quotient)
   312  	C ebx	counter
   313  	C ecx
   314  	C edx	scratch (remainder)
   315  	C esi
   316  	C edi	dst
   317  	C ebp	divisor

   319  	movl	$0, %eax

   321  	divl	%ebp

   323  	movl	%eax, -4(%edi,%ebx,4)
   324  	decl	%ebx
   325  	jnz	L(divide_fraction)

   327  	jmp	L(divide_done)



   331  C -----------------------------------------------------------------------------

   333  L(mul_by_inverse):
   334  	C eax
   335  	C ebx	xsize
   336  	C ecx	size
   337  	C edx	carry
   338  	C esi	src
   339  	C edi	&dst[xsize-1]
   340  	C ebp	divisor

   342  	bsrl	%ebp, %eax		C 31-l

   344  	leal	12(%edi), %ebx		C &dst[xsize+2], loop dst stop
   345  	leal	4(%edi,%ecx,4), %edi	C &dst[xsize+size]

   347  	movl	%edi, VAR_DST
   348  	movl	%ebx, VAR_DST_STOP

   350  	movl	%ecx, %ebx		C size
   351  	movl	$31, %ecx

   353  	movl	%edx, %edi		C carry
   354  	movl	$-1, %edx

   356  	C

   358  	xorl	%eax, %ecx		C l
   359  	incl	%eax			C 32-l

   361  	shll	%cl, %ebp		C d normalized
   362  	movl	%ecx, VAR_NORM

   364  	movd	%eax, %mm7

   366  	movl	$-1, %eax
   367  	subl	%ebp, %edx		C (b-d)-1 giving edx:eax = b*(b-d)-1

   369  	divl	%ebp			C floor (b*(b-d)-1) / d

   371  L(start_preinv):
   372  	C eax	inverse
   373  	C ebx	size
   374  	C ecx	shift
   375  	C edx
   376  	C esi	src
   377  	C edi	carry
   378  	C ebp	divisor
   379  	C
   380  	C mm7	rshift

   382  	orl	%ebx, %ebx		C size
   383  	movl	%eax, VAR_INVERSE
   384  	leal	-12(%esi,%ebx,4), %eax	C &src[size-3]

   386  	jz	L(start_zero)
   387  	movl	%eax, VAR_SRC
   388  	cmpl	$1, %ebx

   390  	movl	8(%eax), %esi		C src high limb
   391  	jz	L(start_one)

   393  L(start_two_or_more):
   394  	movl	4(%eax), %edx		C src second highest limb

   396  	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l

   398  	shldl(	%cl, %edx, %esi)	C n10 = high,second << l

   400  	cmpl	$2, %ebx
   401  	je	L(integer_two_left)
   402  	jmp	L(integer_top)


   405  L(start_one):
   406  	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l

   408  	shll	%cl, %esi		C n10 = high << l
   409  	movl	%eax, VAR_SRC
   410  	jmp	L(integer_one_left)


   413  L(start_zero):
   414  	C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
   415  	C skipped a division.

   417  	shll	%cl, %edi		C n2 = carry << l
   418  	movl	%edi, %eax		C return value for zero_done
   419  	cmpl	$0, PARAM_XSIZE

   421  	je	L(zero_done)
   422  	jmp	L(fraction_some)



   426  C -----------------------------------------------------------------------------
   427  C
   428  C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
   429  C execution.  The instruction scheduling is important, with various
   430  C apparently equivalent forms running 1 to 5 cycles slower.
   431  C
   432  C A lower bound for the time would seem to be 16 cycles, based on the
   433  C following successive dependencies.
   434  C
   435  C		      cycles
   436  C		n2+n1	1
   437  C		mul	6
   438  C		q1+1	1
   439  C		mul	6
   440  C		sub	1
   441  C		addback	1
   442  C		       ---
   443  C		       16
   444  C
   445  C This chain is what the loop has already, but 16 cycles isn't achieved.
   446  C K7 has enough decode, and probably enough execute (depending maybe on what
   447  C a mul actually consumes), but nothing running under 17 has been found.
   448  C
   449  C In theory n2+n1 could be done in the sub and addback stages (by
   450  C calculating both n2 and n2+n1 there), but lack of registers makes this an
   451  C unlikely proposition.
   452  C
   453  C The jz in the loop keeps the q1+1 stage to 1 cycle.  Handling an overflow
   454  C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
   455  C chain, and nothing better than 18 cycles has been found when using it.
   456  C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
   457  C be an extremely rare event.
   458  C
   459  C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
   460  C if some special data is coming out with this always, the q1_ff special
   461  C case actually runs at 15 c/l.  0x2FFF...FFFD divided by 3 is a good way to
   462  C induce the q1_ff case, for speed measurements or testing.  Note that
   463  C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
   464  C
   465  C The instruction groupings and empty comments show the cycles for a naive
   466  C in-order view of the code (conveniently ignoring the load latency on
   467  C VAR_INVERSE).  This shows some of where the time is going, but is nonsense
   468  C to the extent that out-of-order execution rearranges it.  In this case
   469  C there's 19 cycles shown, but it executes at 17.

   471  	ALIGN(16)
   472  L(integer_top):
   473  	C eax	scratch
   474  	C ebx	scratch (nadj, q1)
   475  	C ecx	scratch (src, dst)
   476  	C edx	scratch
   477  	C esi	n10
   478  	C edi	n2
   479  	C ebp	divisor
   480  	C
   481  	C mm0	scratch (src qword)
   482  	C mm7	rshift for normalization

   484  	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
   485  	movl	%edi, %eax         C n2
   486  	movl	VAR_SRC, %ecx

   488  	leal	(%ebp,%esi), %ebx
   489  	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
   490  	sbbl	$-1, %eax          C n2+n1

   492  	mull	VAR_INVERSE        C m*(n2+n1)

   494  	movq	(%ecx), %mm0       C next limb and the one below it
   495  	subl	$4, %ecx

   497  	movl	%ecx, VAR_SRC

   499  	C

   501  	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
   502  	leal	1(%edi), %ebx      C n2+1
   503  	movl	%ebp, %eax	   C d

   505  	C

   507  	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
   508  	jz	L(q1_ff)
   509  	movl	VAR_DST, %ecx

   511  	mull	%ebx		   C (q1+1)*d

   513  	psrlq	%mm7, %mm0

   515  	leal	-4(%ecx), %ecx

   517  	C

   519  	subl	%eax, %esi
   520  	movl	VAR_DST_STOP, %eax

   522  	C

   524  	sbbl	%edx, %edi	   C n - (q1+1)*d
   525  	movl	%esi, %edi	   C remainder -> n2
   526  	leal	(%ebp,%esi), %edx

   528  	movd	%mm0, %esi

   530  	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
   531  	sbbl	$0, %ebx	   C q
   532  	cmpl	%eax, %ecx

   534  	movl	%ebx, (%ecx)
   535  	movl	%ecx, VAR_DST
   536  	jne	L(integer_top)


   539  L(integer_loop_done):


   542  C -----------------------------------------------------------------------------
   543  C
   544  C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
   545  C q1_ff special case.  This makes the code a bit smaller and simpler, and
   546  C costs only 1 cycle (each).

   548  L(integer_two_left):
   549  	C eax	scratch
   550  	C ebx	scratch (nadj, q1)
   551  	C ecx	scratch (src, dst)
   552  	C edx	scratch
   553  	C esi	n10
   554  	C edi	n2
   555  	C ebp	divisor
   556  	C
   557  	C mm7	rshift

   559  	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
   560  	movl	%edi, %eax         C n2
   561  	movl	PARAM_SRC, %ecx

   563  	leal	(%ebp,%esi), %ebx
   564  	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
   565  	sbbl	$-1, %eax          C n2+n1

   567  	mull	VAR_INVERSE        C m*(n2+n1)

   569  	movd	(%ecx), %mm0	   C src low limb

   571  	movl	VAR_DST_STOP, %ecx

   573  	C

   575  	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
   576  	leal	1(%edi), %ebx      C n2+1
   577  	movl	%ebp, %eax	   C d

   579  	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

   581  	sbbl	$0, %ebx

   583  	mull	%ebx		   C (q1+1)*d

   585  	psllq	$32, %mm0

   587  	psrlq	%mm7, %mm0

   589  	C

   591  	subl	%eax, %esi

   593  	C

   595  	sbbl	%edx, %edi	   C n - (q1+1)*d
   596  	movl	%esi, %edi	   C remainder -> n2
   597  	leal	(%ebp,%esi), %edx

   599  	movd	%mm0, %esi

   601  	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
   602  	sbbl	$0, %ebx	   C q

   604  	movl	%ebx, -4(%ecx)


   607  C -----------------------------------------------------------------------------
   608  L(integer_one_left):
   609  	C eax	scratch
   610  	C ebx	scratch (nadj, q1)
   611  	C ecx	dst
   612  	C edx	scratch
   613  	C esi	n10
   614  	C edi	n2
   615  	C ebp	divisor
   616  	C
   617  	C mm7	rshift

   619  	movl	VAR_DST_STOP, %ecx
   620  	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
   621  	movl	%edi, %eax         C n2

   623  	leal	(%ebp,%esi), %ebx
   624  	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
   625  	sbbl	$-1, %eax          C n2+n1

   627  	mull	VAR_INVERSE        C m*(n2+n1)

   629  	C

   631  	C

   633  	C

   635  	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
   636  	leal	1(%edi), %ebx      C n2+1
   637  	movl	%ebp, %eax	   C d

   639  	C

   641  	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

   643  	sbbl	$0, %ebx           C q1 if q1+1 overflowed

   645  	mull	%ebx

   647  	C

   649  	C

   651  	C

   653  	subl	%eax, %esi

   655  	C

   657  	sbbl	%edx, %edi	   C n - (q1+1)*d
   658  	movl	%esi, %edi	   C remainder -> n2
   659  	leal	(%ebp,%esi), %edx

   661  	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
   662  	sbbl	$0, %ebx	   C q

   664  	movl	%ebx, -8(%ecx)
   665  	subl	$8, %ecx



	C Integer part finished: edi holds the normalized remainder.  Either
	C continue into the fractional limbs (xsize!=0) or unnormalize the
	C remainder (shrl by VAR_NORM at fraction_done) and return it.
   669  L(integer_none):
   670  	cmpl	$0, PARAM_XSIZE
   671  	jne	L(fraction_some)

   673  	movl	%edi, %eax
   674  L(fraction_done):
   675  	movl	VAR_NORM, %ecx
   676  L(zero_done):
   677  	movl	SAVE_EBP, %ebp

   679  	movl	SAVE_EDI, %edi
   680  	movl	SAVE_ESI, %esi

   682  	movl	SAVE_EBX, %ebx
   683  	addl	$STACK_SPACE, %esp

   685  	shrl	%cl, %eax
   686  	emms

   688  	ret


   691  C -----------------------------------------------------------------------------
   692  C
   693  C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
   694  C of q*d is simply -d and the remainder n-q*d = n10+d

   696  L(q1_ff):
   697  	C eax	(divisor)
   698  	C ebx	(q1+1 == 0)
   699  	C ecx
   700  	C edx
   701  	C esi	n10
   702  	C edi	n2
   703  	C ebp	divisor

   705  	movl	VAR_DST, %ecx
   706  	movl	VAR_DST_STOP, %edx
   707  	subl	$4, %ecx

   709  	psrlq	%mm7, %mm0
   710  	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2
   711  	movl	%ecx, VAR_DST

   713  	movd	%mm0, %esi		C next n10

   715  	movl	$-1, (%ecx)
   716  	cmpl	%ecx, %edx
   717  	jne	L(integer_top)

   719  	jmp	L(integer_loop_done)



   723  C -----------------------------------------------------------------------------
   724  C
   725  C Being the fractional part, the "source" limbs are all zero, meaning
   726  C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
   727  C
   728  C The loop runs at 15 cycles.  The dependent chain is the same as the
   729  C general case above, but without the n2+n1 stage (due to n1==0), so 15
   730  C would seem to be the lower bound.
   731  C
   732  C A not entirely obvious simplification is that q1+1 never overflows a limb,
   733  C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
   734  C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
   735  C rnd() means rounding down to a multiple of d.
   736  C
   737  C	m*n2 + b*n2 <= m*(d-1) + b*(d-1)
   738  C		     = m*d + b*d - m - b
   739  C		     = floor((b(b-d)-1)/d)*d + b*d - m - b
   740  C		     = rnd(b(b-d)-1) + b*d - m - b
   741  C		     = rnd(b(b-d)-1 + b*d) - m - b
   742  C		     = rnd(b*b-1) - m - b
   743  C		     <= (b-2)*b
   744  C
   745  C Unchanged from the general case is that the final quotient limb q can be
   746  C either q1 or q1+1, and the q1+1 case occurs often.  This can be seen from
   747  C equation 8.4 of the paper which simplifies as follows when n1==0 and
   748  C n0==0.
   749  C
   750  C	n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
   751  C
   752  C As before, the instruction groupings and empty comments show a naive
   753  C in-order view of the code, which is made a nonsense by out of order
   754  C execution.  There's 17 cycles shown, but it executes at 15.
   755  C
   756  C Rotating the store q and remainder->n2 instructions up to the top of the
   757  C loop gets the run time down from 16 to 15.

   759  	ALIGN(16)
   760  L(fraction_some):
   761  	C eax
   762  	C ebx
   763  	C ecx
   764  	C edx
   765  	C esi
   766  	C edi	carry
   767  	C ebp	divisor

   769  	movl	PARAM_DST, %esi
   770  	movl	VAR_DST_STOP, %ecx	C &dst[xsize+2]
   771  	movl	%edi, %eax

   773  	subl	$8, %ecx		C &dst[xsize]
   774  	jmp	L(fraction_entry)


   777  	ALIGN(16)
   778  L(fraction_top):
   779  	C eax	n2 carry, then scratch
   780  	C ebx	scratch (nadj, q1)
   781  	C ecx	dst, decrementing
   782  	C edx	scratch
   783  	C esi	dst stop point
   784  	C edi	(will be n2)
   785  	C ebp	divisor

   787  	movl	%ebx, (%ecx)	C previous q
   788  	movl	%eax, %edi	C remainder->n2

   790  L(fraction_entry):
   791  	mull	VAR_INVERSE	C m*n2

   793  	movl	%ebp, %eax	C d
   794  	subl	$4, %ecx	C dst
   795  	leal	1(%edi), %ebx

   797  	C

   799  	C

   801  	C

   803  	C

   805  	addl	%edx, %ebx	C 1 + high(n2<<32 + m*n2) = q1+1

   807  	mull	%ebx		C (q1+1)*d

   809  	C

   811  	C

   813  	C

   815  	negl	%eax		C low of n - (q1+1)*d

   817  	C

   819  	sbbl	%edx, %edi	C high of n - (q1+1)*d, caring only about carry

   820  	leal	(%ebp,%eax), %edx

   822  	cmovc(	%edx, %eax)	C n - q1*d if underflow from using q1+1
   823  	sbbl	$0, %ebx	C q
   824  	cmpl	%esi, %ecx

   826  	jne	L(fraction_top)


   829  	movl	%ebx, (%ecx)
   830  	jmp	L(fraction_done)

   832  EPILOGUE()