github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/p6/mmx/divrem_1.asm

dnl  Intel Pentium-II mpn_divrem_1 -- mpn by limb division.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.


C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                         mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C                          mp_srcptr src, mp_size_t size,
C                          mp_limb_t divisor, mp_limb_t carry);
C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                                mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t inverse,
C                                unsigned shift);
C
C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm;
C see that file for some comments.  It's possible that what's here can be
C improved.


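C For orientation, the following hedged C sketch (not GMP code; the name
C ref_divrem_1 and a 32-bit limb are assumptions for illustration) shows
C what mpn_divrem_1 computes: size integer quotient limbs are stored at
C dst[xsize..xsize+size-1], xsize further fraction limbs are developed
C from zero low limbs into dst[0..xsize-1], and the final remainder is
C returned.  It mirrors the plain divl loops further down.
C
C	#include <stdint.h>
C	#include <stddef.h>
C
C	uint32_t ref_divrem_1 (uint32_t *dst, size_t xsize,
C	                       const uint32_t *src, size_t size, uint32_t d)
C	{
C	  uint32_t r = 0;               /* running remainder */
C	  uint64_t n;
C	  size_t i;
C	  for (i = size; i-- > 0; )     /* integer part, high limb first */
C	    {
C	      n = ((uint64_t) r << 32) | src[i];
C	      dst[xsize + i] = (uint32_t) (n / d);
C	      r = (uint32_t) (n % d);
C	    }
C	  for (i = xsize; i-- > 0; )    /* fraction part, zero low limbs */
C	    {
C	      n = (uint64_t) r << 32;
C	      dst[i] = (uint32_t) (n / d);
C	      r = (uint32_t) (n % d);
C	    }
C	  return r;
C	}
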
dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
dnl
dnl  The different speeds of the integer and fraction parts mean that using
dnl  xsize+size isn't quite right.  The threshold wants to be a bit higher
dnl  for the integer part and a bit lower for the fraction part.  (Or what's
dnl  really wanted is to speed up the integer part!)
dnl
dnl  The threshold is set to make the integer part right.  At 4 limbs the
dnl  div and mul are about the same there, but on the fractional part the
dnl  mul is much faster.

deflit(MUL_THRESHOLD, 4)


defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,   16)
defframe(PARAM_SRC,    12)
defframe(PARAM_XSIZE,  8)
defframe(PARAM_DST,    4)

defframe(SAVE_EBX,    -4)
defframe(SAVE_ESI,    -8)
defframe(SAVE_EDI,    -12)
defframe(SAVE_EBP,    -16)

defframe(VAR_NORM,    -20)
defframe(VAR_INVERSE, -24)
defframe(VAR_SRC,     -28)
defframe(VAR_DST,     -32)
defframe(VAR_DST_STOP,-36)

deflit(STACK_SPACE, 36)

	TEXT
	ALIGN(16)

PROLOGUE(mpn_preinv_divrem_1)
deflit(`FRAME',0)
	movl	PARAM_XSIZE, %ecx
	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebx, SAVE_EBX
	movl	PARAM_SIZE, %ebx

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edx

	movl	-4(%esi,%ebx,4), %eax	C src high limb
	xorl	%edi, %edi		C initial carry (if can't skip a div)

	C

	leal	8(%edx,%ecx,4), %edx	C &dst[xsize+2]
	xorl	%ecx, %ecx

	movl	%edx, VAR_DST_STOP	C &dst[xsize+2]
	cmpl	%ebp, %eax		C high cmp divisor

	cmovc(	%eax, %edi)		C high is carry if high<divisor

	cmovnc(	%eax, %ecx)		C 0 if skip div, src high if not
					C (the latter in case src==dst)

	movl	%ecx, -12(%edx,%ebx,4)	C dst high limb

	sbbl	$0, %ebx		C skip one division if high<divisor
	movl	PARAM_PREINV_SHIFT, %ecx

	leal	-8(%edx,%ebx,4), %edx	C &dst[xsize+size]
	movl	$32, %eax

	movl	%edx, VAR_DST		C &dst[xsize+size]

	shll	%cl, %ebp		C d normalized
	subl	%ecx, %eax
	movl	%ecx, VAR_NORM

	movd	%eax, %mm7		C rshift
	movl	PARAM_PREINV_INVERSE, %eax
	jmp	L(start_preinv)

EPILOGUE()



	ALIGN(16)

PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)
	movl	PARAM_CARRY, %edx

	movl	PARAM_SIZE, %ecx
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	leal	-4(%edi,%ebx,4), %edi
	jmp	L(start_1c)

EPILOGUE()


	C offset 0x31, close enough to aligned
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	$0, %edx		C initial carry (if can't skip a div)
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi
	orl	%ecx, %ecx		C size

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
	jz	L(no_skip_div)		C if size==0

	movl	-4(%esi,%ecx,4), %eax	C src high limb
	xorl	%esi, %esi
	cmpl	%ebp, %eax		C high cmp divisor

	cmovc(	%eax, %edx)		C high is carry if high<divisor

	cmovnc(	%eax, %esi)		C 0 if skip div, src high if not
					C (the latter in case src==dst)

	movl	%esi, (%edi,%ecx,4)	C dst high limb

	sbbl	$0, %ecx		C size-1 if high<divisor
	movl	PARAM_SRC, %esi		C reload
L(no_skip_div):


L(start_1c):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	leal	(%ebx,%ecx), %eax	C size+xsize
	cmpl	$MUL_THRESHOLD, %eax
	jae	L(mul_by_inverse)

	orl	%ecx, %ecx
	jz	L(divide_no_integer)

L(divide_integer):
	C eax	scratch (quotient)
	C ebx	xsize
	C ecx	counter
	C edx	scratch (remainder)
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	movl	-4(%esi,%ecx,4), %eax

	divl	%ebp

	movl	%eax, (%edi,%ecx,4)
	decl	%ecx
	jnz	L(divide_integer)


L(divide_no_integer):
	movl	PARAM_DST, %edi
	orl	%ebx, %ebx
	jnz	L(divide_fraction)

L(divide_done):
	movl	SAVE_ESI, %esi

	movl	SAVE_EDI, %edi

	movl	SAVE_EBX, %ebx
	movl	%edx, %eax

	movl	SAVE_EBP, %ebp
	addl	$STACK_SPACE, %esp

	ret


L(divide_fraction):
	C eax	scratch (quotient)
	C ebx	counter
	C ecx
	C edx	scratch (remainder)
	C esi
	C edi	dst
	C ebp	divisor

	movl	$0, %eax

	divl	%ebp

	movl	%eax, -4(%edi,%ebx,4)
	decl	%ebx
	jnz	L(divide_fraction)

	jmp	L(divide_done)



C -----------------------------------------------------------------------------

L(mul_by_inverse):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	leal	12(%edi), %ebx		C &dst[xsize+2], loop dst stop

	movl	%ebx, VAR_DST_STOP
	leal	4(%edi,%ecx,4), %edi	C &dst[xsize+size]

	movl	%edi, VAR_DST
	movl	%ecx, %ebx		C size

	bsrl	%ebp, %ecx		C 31-l
	movl	%edx, %edi		C carry

	leal	1(%ecx), %eax		C 32-l
	xorl	$31, %ecx		C l

	movl	%ecx, VAR_NORM
	movl	$-1, %edx

	shll	%cl, %ebp		C d normalized
	movd	%eax, %mm7

	movl	$-1, %eax
	subl	%ebp, %edx		C (b-d)-1 giving edx:eax = b*(b-d)-1

	divl	%ebp			C floor (b*(b-d)-1) / d
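
C The divl just above forms the pre-inverse, m = floor((b*(b-d)-1)/d)
C with b = 2^32, for the now-normalized divisor.  A hedged C equivalent
C (preinv_32 is an illustrative name, not a GMP interface):
C
C	#include <stdint.h>
C
C	uint32_t preinv_32 (uint32_t d)    /* d normalized, bit 31 set */
C	{
C	  /* b*(b-d)-1 == ((b-1)-d)*b + (b-1), the edx:eax pair above */
C	  uint64_t n = ((uint64_t) (0xFFFFFFFFu - d) << 32) | 0xFFFFFFFFu;
C	  return (uint32_t) (n / d);
C	}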

L(start_preinv):
	C eax	inverse
	C ebx	size
	C ecx	shift
	C edx
	C esi	src
	C edi	carry
	C ebp	divisor
	C
	C mm7	rshift

	movl	%eax, VAR_INVERSE
	orl	%ebx, %ebx		C size
	leal	-12(%esi,%ebx,4), %eax	C &src[size-3]

	movl	%eax, VAR_SRC
	jz	L(start_zero)

	movl	8(%eax), %esi		C src high limb
	cmpl	$1, %ebx
	jz	L(start_one)

L(start_two_or_more):
	movl	4(%eax), %edx		C src second highest limb

	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l

	shldl(	%cl, %edx, %esi)	C n10 = high,second << l

	cmpl	$2, %ebx
	je	L(integer_two_left)
	jmp	L(integer_top)


L(start_one):
	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l

	shll	%cl, %esi		C n10 = high << l
	jmp	L(integer_one_left)


L(start_zero):
	C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
	C skipped a division.

	shll	%cl, %edi		C n2 = carry << l
	movl	%edi, %eax		C return value for zero_done
	cmpl	$0, PARAM_XSIZE

	je	L(zero_done)
	jmp	L(fraction_some)



C -----------------------------------------------------------------------------
C
C This loop runs at about 25 cycles, which is probably sub-optimal, and
C certainly more than the dependent chain would suggest.  A better loop, or
C a better rough analysis of what's possible, would be welcome.
C
C In the current implementation, the following successively dependent
C micro-ops seem to exist.
C
C		       uops
C		n2+n1	1   (addl)
C		mul	5
C		q1+1	3   (addl/adcl)
C		mul	5
C		sub	3   (subl/sbbl)
C		addback	2   (cmov)
C		       ---
C		       19
C
C Lack of registers hinders explicit scheduling, and it might be that the
C normal out-of-order execution isn't able to hide enough under the mul
C latencies.
C
C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
C cmov (and takes one uop off the dependent chain).  A sarl/andl/addl
C combination was tried for the addback (despite the fact it would lengthen
C the dependent chain) but found to be no faster.
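
C As a reading aid, here is a hedged C model of one iteration (not GMP's
C udiv_qrnnd_preinv macro; div_step and its out-parameter are assumed
C names): it divides n2*2^32 + n10 by the normalized d using the
C pre-inverse m, stores the quotient limb, and returns the new n2.
C
C	#include <stdint.h>
C
C	uint32_t div_step (uint32_t *qp, uint32_t n2, uint32_t n10,
C	                   uint32_t d, uint32_t m)
C	{
C	  uint32_t n1 = n10 >> 31;              /* the sarl/negl below */
C	  uint32_t nadj = n10 + (n1 ? d : 0);   /* wraps, as in the asm */
C	  uint64_t t = (uint64_t) m * (n2 + n1) + nadj;
C	  uint32_t q1p1 = n2 + 1 + (uint32_t) (t >> 32);  /* q1+1 */
C	  uint64_t n = ((uint64_t) n2 << 32) | n10;
C	  uint64_t sub;
C	  if (q1p1 == 0)            /* wrapped: the L(q1_ff) special case */
C	    { *qp = 0xFFFFFFFFu; return n10 + d; }
C	  sub = (uint64_t) q1p1 * d;
C	  if (sub > n)              /* borrow: q1+1 was one too big */
C	    { *qp = q1p1 - 1; return (uint32_t) (n - sub) + d; }
C	  *qp = q1p1;               /* q = q1+1 was exact */
C	  return (uint32_t) (n - sub);
C	}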


	ALIGN(16)
L(integer_top):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	scratch (src, dst)
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	d
	C
	C mm0	scratch (src qword)
	C mm7	rshift for normalization

	movl	%esi, %eax
	movl	%ebp, %ebx

	sarl	$31, %eax          C -n1
	movl	VAR_SRC, %ecx

	andl	%eax, %ebx         C -n1 & d
	negl	%eax               C n1

	addl	%esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
	addl	%edi, %eax         C n2+n1
	movq	(%ecx), %mm0       C next src limb and the one below it

	mull	VAR_INVERSE        C m*(n2+n1)

	subl	$4, %ecx

	movl	%ecx, VAR_SRC

	C

	C

	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
	movl	%ebp, %eax	   C d
	leal	1(%edi), %ebx      C n2+1

	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
	jz	L(q1_ff)

	mull	%ebx		   C (q1+1)*d

	movl	VAR_DST, %ecx
	psrlq	%mm7, %mm0

	C

	C

	C

	subl	%eax, %esi
	movl	VAR_DST_STOP, %eax

	sbbl	%edx, %edi	   C n - (q1+1)*d
	movl	%esi, %edi	   C remainder -> n2
	leal	(%ebp,%esi), %edx

	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
	movd	%mm0, %esi

	sbbl	$0, %ebx	   C q
	subl	$4, %ecx

	movl	%ebx, (%ecx)
	cmpl	%eax, %ecx

	movl	%ecx, VAR_DST
	jne	L(integer_top)


L(integer_loop_done):


C -----------------------------------------------------------------------------
C
C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
C q1_ff special case.  This makes the code a bit smaller and simpler, and
C costs only 2 cycles (each).

L(integer_two_left):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	scratch (src, dst)
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm7	rshift


	movl	%esi, %eax
	movl	%ebp, %ebx

	sarl	$31, %eax          C -n1
	movl	PARAM_SRC, %ecx

	andl	%eax, %ebx         C -n1 & d
	negl	%eax               C n1

	addl	%esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
	addl	%edi, %eax         C n2+n1

	mull	VAR_INVERSE        C m*(n2+n1)

	movd	(%ecx), %mm0	   C src low limb

	movl	VAR_DST_STOP, %ecx

	C

	C

	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx      C n2+1
	movl	%ebp, %eax	   C d

	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

	sbbl	$0, %ebx

	mull	%ebx		   C (q1+1)*d

	psllq	$32, %mm0

	psrlq	%mm7, %mm0

	C

	C

	subl	%eax, %esi

	sbbl	%edx, %edi	   C n - (q1+1)*d
	movl	%esi, %edi	   C remainder -> n2
	leal	(%ebp,%esi), %edx

	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
	movd	%mm0, %esi

	sbbl	$0, %ebx	   C q

	movl	%ebx, -4(%ecx)


C -----------------------------------------------------------------------------
L(integer_one_left):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	scratch (dst)
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm7	rshift


	movl	%esi, %eax
	movl	%ebp, %ebx

	sarl	$31, %eax          C -n1
	movl	VAR_DST_STOP, %ecx

	andl	%eax, %ebx         C -n1 & d
	negl	%eax               C n1

	addl	%esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
	addl	%edi, %eax         C n2+n1

	mull	VAR_INVERSE        C m*(n2+n1)

	C

	C

	C

	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx      C n2+1
	movl	%ebp, %eax	   C d

	C

	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

	sbbl	$0, %ebx           C q1 if q1+1 overflowed

	mull	%ebx

	C

	C

	C

	C

	subl	%eax, %esi
	movl	PARAM_XSIZE, %eax

	sbbl	%edx, %edi	   C n - (q1+1)*d
	movl	%esi, %edi	   C remainder -> n2
	leal	(%ebp,%esi), %edx

	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1

	sbbl	$0, %ebx	   C q

	movl	%ebx, -8(%ecx)
	subl	$8, %ecx



	orl	%eax, %eax         C xsize
	jnz	L(fraction_some)

	movl	%edi, %eax
L(fraction_done):
	movl	VAR_NORM, %ecx
L(zero_done):
	movl	SAVE_EBP, %ebp

	movl	SAVE_EDI, %edi

	movl	SAVE_ESI, %esi

	movl	SAVE_EBX, %ebx
	addl	$STACK_SPACE, %esp

	shrl	%cl, %eax
	emms

	ret


C -----------------------------------------------------------------------------
C
C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF, meaning the low dword
C of q*d is simply -d and the remainder n-q*d = n10+d.
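C
C To spell that out (a derivation added here, not original GMP text): the
C quotient digit cannot reach b = 2^32, so q = q1 = b-1.  Then
C q*d = b*d - d, whose low dword is (-d) mod b = b-d, and
C n - q*d = n2*b + n10 - b*d + d, which mod b is n10 + d: exactly what
C the leal below stores as the next n2.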

L(q1_ff):
	C eax	(divisor)
	C ebx	(q1+1 == 0)
	C ecx
	C edx
	C esi	n10
	C edi	n2
	C ebp	divisor

	movl	VAR_DST, %ecx
	movl	VAR_DST_STOP, %edx
	subl	$4, %ecx

	movl	%ecx, VAR_DST
	psrlq	%mm7, %mm0
	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2

	movl	$-1, (%ecx)
	movd	%mm0, %esi		C next n10

	cmpl	%ecx, %edx
	jne	L(integer_top)

	jmp	L(integer_loop_done)



C -----------------------------------------------------------------------------
C
C In the current implementation, the following successively dependent
C micro-ops seem to exist.
C
C		       uops
C		mul	5
C		q1+1	1   (addl)
C		mul	5
C		sub	3   (negl/sbbl)
C		addback	2   (cmov)
C		       ---
C		       16
C
C The loop in fact runs at about 17.5 cycles.  Using a sarl/andl/addl for
C the addback was found to be a touch slower.
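
C With the low limb always zero here, the n1/nadj terms of the integer
C loop drop out and each step reduces to the following hedged C model
C (frac_step is an assumed name; q1+1 cannot wrap in this case, so there
C is no q1_ff path):
C
C	#include <stdint.h>
C
C	uint32_t frac_step (uint32_t *qp, uint32_t n2, uint32_t d, uint32_t m)
C	{
C	  uint64_t t = (uint64_t) m * n2;
C	  uint32_t q1p1 = n2 + 1 + (uint32_t) (t >> 32);  /* q1+1 */
C	  uint64_t n = (uint64_t) n2 << 32;
C	  uint64_t sub = (uint64_t) q1p1 * d;
C	  if (sub > n)              /* borrow: add back d, use q1 */
C	    { *qp = q1p1 - 1; return (uint32_t) (n - sub) + d; }
C	  *qp = q1p1;
C	  return (uint32_t) (n - sub);
C	}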


	ALIGN(16)
L(fraction_some):
	C eax
	C ebx
	C ecx
	C edx
	C esi
	C edi	carry
	C ebp	divisor

	movl	PARAM_DST, %esi
	movl	VAR_DST_STOP, %ecx	C &dst[xsize+2]
	movl	%edi, %eax

	subl	$8, %ecx		C &dst[xsize]


	ALIGN(16)
L(fraction_top):
	C eax	n2, then scratch
	C ebx	scratch (nadj, q1)
	C ecx	dst, decrementing
	C edx	scratch
	C esi	dst stop point
	C edi	n2
	C ebp	divisor

	mull	VAR_INVERSE	C m*n2

	movl	%ebp, %eax	C d
	subl	$4, %ecx	C dst
	leal	1(%edi), %ebx

	C

	C

	C

	addl	%edx, %ebx	C 1 + high(n2<<32 + m*n2) = q1+1

	mull	%ebx		C (q1+1)*d

	C

	C

	C

	C

	negl	%eax		C low of n - (q1+1)*d

	sbbl	%edx, %edi	C high of n - (q1+1)*d, caring only about carry
	leal	(%ebp,%eax), %edx

	cmovc(	%edx, %eax)	C n - q1*d if underflow from using q1+1

	sbbl	$0, %ebx	C q
	movl	%eax, %edi	C remainder->n2
	cmpl	%esi, %ecx

	movl	%ebx, (%ecx)	C previous q
	jne	L(fraction_top)


	jmp	L(fraction_done)

EPILOGUE()