github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/p6/dive_1.asm (about)

     1  dnl  Intel P6 mpn_divexact_1 -- mpn by limb exact division.
     2  
     3  dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C       odd  even  divisor
    35  C P6:  10.0  12.0  cycles/limb
    36  
    37  
    38  C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
    39  C                      mp_limb_t divisor);
    40  C
    41  C The odd case is basically the same as mpn_modexact_1_odd, just with an
    42  C extra store, and it runs at the same 10 cycles which is the dependent
    43  C chain.
    44  C
    45  C The shifts for the even case aren't on the dependent chain so in principle
    46  C it could run the same too, but nothing running at 10 has been found.
    47  C Perhaps there are too many uops (an extra 4 over the odd case).
    48  
    49  defframe(PARAM_DIVISOR,16)	C divisor, later overwritten with d without twos
    50  defframe(PARAM_SIZE,   12)	C size, counted in limbs
    51  defframe(PARAM_SRC,     8)	C src pointer
    52  defframe(PARAM_DST,     4)	C dst pointer, later overwritten with &dst[size]
    53  
    54  defframe(SAVE_EBX,     -4)	C ebx, esi, edi and ebp are saved here and
    55  defframe(SAVE_ESI,     -8)	C reloaded before each ret
    56  defframe(SAVE_EDI,    -12)
    57  defframe(SAVE_EBP,    -16)
    58  defframe(VAR_INVERSE, -20)	C inverse of d without twos, mod 2^GMP_LIMB_BITS
    59  deflit(STACK_SPACE, 20)		C bytes taken from esp for the five slots above
    60  
    61  	TEXT
    62  
    63  	ALIGN(16)
    64  PROLOGUE(mpn_divexact_1)
    65  deflit(`FRAME',0)
    66  	C setup: ecx = twos in d, PARAM_DIVISOR = d without twos, VAR_INVERSE = its inverse
    67  	movl	PARAM_DIVISOR, %eax
    68  	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
    69  
    70  	movl	%esi, SAVE_ESI
    71  	movl	PARAM_SRC, %esi
    72  
    73  	movl	%ebx, SAVE_EBX
    74  	movl	PARAM_SIZE, %ebx
    75  
    76  	bsfl	%eax, %ecx		C count of trailing twos (d assumed nonzero)
    77  
    78  	movl	%ebp, SAVE_EBP
    79  
    80  	shrl	%cl, %eax		C d without twos
    81  
    82  	movl	%eax, %edx
    83  	shrl	%eax			C d/2 without twos
    84  
    85  	movl	%edx, PARAM_DIVISOR	C parameter slot now holds d without twos
    86  	andl	$127, %eax		C 7-bit index into binvert_limb_table
    87  
    88  ifdef(`PIC',`
    89  	LEA(	binvert_limb_table, %ebp)
    90  	movzbl	(%eax,%ebp), %ebp		C inv 8 bits
    91  ',`
    92  	movzbl	binvert_limb_table(%eax), %ebp	C inv 8 bits
    93  ')
    94  
    95  	leal	(%ebp,%ebp), %eax	C 2*inv
    96  
    97  	imull	%ebp, %ebp		C inv*inv
    98  
    99  	movl	%edi, SAVE_EDI
   100  	movl	PARAM_DST, %edi
   101  
   102  	leal	(%esi,%ebx,4), %esi	C src end
   103  
   104  	imull	PARAM_DIVISOR, %ebp	C inv*inv*d
   105  
   106  	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
   107  	leal	(%eax,%eax), %ebp	C 2*inv
   108  
   109  	imull	%eax, %eax		C inv*inv
   110  
   111  	leal	(%edi,%ebx,4), %edi	C dst end
   112  	negl	%ebx			C -size
   113  
   114  	movl	%edi, PARAM_DST		C keep dst end in memory, the even loop reuses edi
   115  
   116  	imull	PARAM_DIVISOR, %eax	C inv*inv*d
   117  
   118  	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d, now a full limb
   119  
   120  	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
   121  	movl	PARAM_DIVISOR, %eax
   122  	imull	%ebp, %eax
   123  	cmpl	$1, %eax')
   124  
   125  	movl	%ebp, VAR_INVERSE
   126  	movl	(%esi,%ebx,4), %eax	C src[0]
   127  
   128  	orl	%ecx, %ecx		C test the shift count
   129  	jnz	L(even)			C nonzero shift, d had trailing twos
   130  
   131  	C d was odd: ecx is zero and serves as the initial carry bit
   132  	jmp	L(odd_entry)
   133  
   134  
   135  C The dependent chain here is
   136  C
   137  C	subl	%edx, %eax          1
   138  C	imull	VAR_INVERSE, %eax   4
   139  C	mull	PARAM_DIVISOR       5
   140  C			          ----
   141  C	total			   10
   142  C
   143  C and this is the measured speed.  No special scheduling is necessary, out
   144  C of order execution hides the load latency.
   145  
   146  L(odd_top):
   147  	C eax	scratch (src limb)
   148  	C ebx	counter, limbs, negative
   149  	C ecx	carry bit
   150  	C edx	carry limb, high of last product
   151  	C esi	&src[size]
   152  	C edi	&dst[size]
   153  	C ebp	inverse (unused here, the loop multiplies by VAR_INVERSE)
   154  
   155  	mull	PARAM_DIVISOR		C edx = high limb of q*d, q = limb just stored
   156  
   157  	movl	(%esi,%ebx,4), %eax	C next src limb
   158  	subl	%ecx, %eax		C subtract carry bit
   159  
   160  	sbbl	%ecx, %ecx		C ecx = 0 or -1 from that borrow
   161  	subl	%edx, %eax		C subtract carry limb
   162  
   163  	sbbl	$0, %ecx		C fold in the second borrow
   164  
   165  L(odd_entry):
   166  	imull	VAR_INVERSE, %eax	C q = limb * inverse
   167  
   168  	movl	%eax, (%edi,%ebx,4)	C store quotient limb
   169  	negl	%ecx			C negated borrow becomes the next carry bit
   170  
   171  	incl	%ebx
   172  	jnz	L(odd_top)
   173  
   174  	C all limbs stored: restore saved registers and return
   175  	movl	SAVE_ESI, %esi
   176  
   177  	movl	SAVE_EDI, %edi
   178  
   179  	movl	SAVE_EBP, %ebp
   180  
   181  	movl	SAVE_EBX, %ebx
   182  	addl	$STACK_SPACE, %esp
   183  
   184  	ret
   185  
   186  
   187  L(even):
   188  	C eax	src[0]
   189  	C ebx	counter, limbs, negative
   190  	C ecx	shift
   191  	C edx	d without twos (dead, zeroed below)
   192  	C esi	&src[size]
   193  	C edi	&dst[size]
   194  	C ebp	inverse (dead, becomes the carry bit below)
   195  
   196  	xorl	%ebp, %ebp		C initial carry bit
   197  	xorl	%edx, %edx		C initial carry limb (for size==1)
   198  
   199  	incl	%ebx			C counter runs one limb ahead in the even case
   200  	jz	L(even_one)		C size==1, no src[1] to shift in
   201  
   202  	movl	(%esi,%ebx,4), %edi	C src[1]
   203  
   204  	shrdl(	%cl, %edi, %eax)	C src[0] >> shift, top bits from src[1]
   205  
   206  	jmp	L(even_entry)
   207  
   208  
   209  L(even_top):
   210  	C eax	scratch
   211  	C ebx	counter, limbs, negative
   212  	C ecx	shift
   213  	C edx	scratch
   214  	C esi	&src[size]
   215  	C edi	&dst[size] and scratch
   216  	C ebp	carry bit
   217  
   218  	movl	(%esi,%ebx,4), %edi	C limb above the current one
   219  
   220  	mull	PARAM_DIVISOR		C edx = high limb of q*d, q = limb just stored
   221  
   222  	movl	-4(%esi,%ebx,4), %eax	C current src limb
   223  	shrdl(	%cl, %edi, %eax)	C shifted right, top bits from the limb above
   224  
   225  	subl	%ebp, %eax		C subtract carry bit
   226  
   227  	sbbl	%ebp, %ebp		C ebp = 0 or -1 from that borrow
   228  	subl	%edx, %eax		C subtract carry limb
   229  
   230  	sbbl	$0, %ebp		C fold in the second borrow
   231  
   232  L(even_entry):
   233  	imull	VAR_INVERSE, %eax	C q = limb * inverse
   234  
   235  	movl	PARAM_DST, %edi		C reload &dst[size], edi was scratch
   236  	negl	%ebp			C negated borrow becomes the next carry bit
   237  
   238  	movl	%eax, -4(%edi,%ebx,4)	C store quotient limb (ebx is one ahead)
   239  	incl	%ebx
   240  	jnz	L(even_top)
   241  
   242  	C last limb: nothing above it, so even_one uses a plain shrl
   243  
   244  	mull	PARAM_DIVISOR		C edx = high limb of q*d
   245  
   246  	movl	-4(%esi), %eax		C src[size-1]
   247  
   248  L(even_one):
   249  	shrl	%cl, %eax
   250  	movl	SAVE_ESI, %esi
   251  
   252  	subl	%ebp, %eax		C subtract carry bit
   253  	movl	SAVE_EBP, %ebp
   254  
   255  	subl	%edx, %eax		C subtract carry limb
   256  	movl	SAVE_EBX, %ebx
   257  
   258  	imull	VAR_INVERSE, %eax
   259  
   260  	movl	%eax, -4(%edi)		C dst[size-1]
   261  	movl	SAVE_EDI, %edi
   262  	addl	$STACK_SPACE, %esp
   263  
   264  	ret
   265  
   266  EPILOGUE()
   267  ASM_END()