github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/p6/bdiv_q_1.asm (about)

     1  dnl  Intel P6 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- exact division style quotient by a 1-limb divisor.
     2  
     3  dnl  Rearranged from mpn/x86/p6/dive_1.asm by Marco Bodrato.
     4  
     5  dnl  Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  
    36  C       odd  even  divisor
    37  C P6:  10.0  12.0  cycles/limb
    38  
    39  C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
    40  
    41  C The odd case is basically the same as mpn_modexact_1_odd, just with an
    42  C extra store, and it runs at the same 10 cycles which is the dependent
    43  C chain.
    44  C
    45  C The shifts for the even case aren't on the dependent chain so in principle
    46  C it could run the same too, but nothing running at 10 has been found.
    47  C Perhaps there are too many uops (an extra 4 over the odd case).
    48  
    49  defframe(PARAM_SHIFT,  24)	C read only by the mpn_pi1_bdiv_q_1 entry
    50  defframe(PARAM_INVERSE,20)	C read only by the mpn_pi1_bdiv_q_1 entry
    51  defframe(PARAM_DIVISOR,16)	C mpn_bdiv_q_1 overwrites this with the odd part
    52  defframe(PARAM_SIZE,   12)
    53  defframe(PARAM_SRC,     8)
    54  defframe(PARAM_DST,     4)	C the even path overwrites this with dst end
    55  
        C Save slots for ebx, esi, edi and ebp.  Both entry points subtract
        C STACK_SPACE from esp before storing to them.
        C NOTE(review): defframe is defined outside this file; the offsets are
        C presumably relative to esp at function entry -- confirm in x86-defs.m4.
    56  defframe(SAVE_EBX,     -4)
    57  defframe(SAVE_ESI,     -8)
    58  defframe(SAVE_EDI,    -12)
    59  defframe(SAVE_EBP,    -16)
    60  deflit(STACK_SPACE, 16)
    61  
    62  dnl  re-use parameter space: PARAM_SRC is already in esi when VAR_INVERSE is written
    63  define(VAR_INVERSE,`PARAM_SRC')
    64  
    65  	TEXT
    66  
    67  C mp_limb_t
    68  C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
    69  C		    mp_limb_t inverse, int shift)
        C
        C Writes size quotient limbs to dst, working from the lowest address of
        C src upward.  Each limb is q = (s - c - h) * inverse mod 2^32, where s is
        C the source limb (read shifted right by shift bits when shift is
        C non-zero), h is the high half of the previous q * divisor, and c is the
        C borrow count left by the two subtractions of the previous step.
        C
        C size must be at least 1: src[0] is loaded unconditionally.  On return
        C eax holds the last quotient limb that was stored.
        C
        C NOTE(review): this presumably requires divisor to be odd (low zero bits
        C already removed) with inverse * divisor == 1 mod 2^32.  That is what the
        C mpn_bdiv_q_1 entry sets up before jumping to L(common); confirm for
        C direct callers.
    70  
    71  	ALIGN(16)
    72  PROLOGUE(mpn_pi1_bdiv_q_1)
    73  deflit(`FRAME',0)
    74  
    75  	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
    76  
    77  	movl	%esi, SAVE_ESI
    78  	movl	PARAM_SRC, %esi
    79  
    80  	movl	%ebx, SAVE_EBX
    81  	movl	PARAM_SIZE, %ebx
    82  
    83  	movl	%ebp, SAVE_EBP
    84  	movl	PARAM_INVERSE, %ebp
    85  
    86  	movl	PARAM_SHIFT, %ecx	C trailing twos
    87  
        	C Shared tail, also entered by a jmp from mpn_bdiv_q_1.  On arrival:
        	C ebx	size
        	C ecx	shift
        	C esi	src
        	C ebp	inverse
        	C esi, ebx and ebp are already saved; edi is saved here.
    88  L(common):
    89  	movl	%edi, SAVE_EDI
    90  	movl	PARAM_DST, %edi
    91  
    92  	leal	(%esi,%ebx,4), %esi	C src end
    93  
    94  	leal	(%edi,%ebx,4), %edi	C dst end
    95  	negl	%ebx			C -size
    96  
    97  	movl	(%esi,%ebx,4), %eax	C src[0]
    98  
    99  	orl	%ecx, %ecx		C shift == 0 ?
   100  	jz	L(odd_entry)		C yes: ecx = 0 is also the initial carry bit
   101  
   102  	movl	%edi, PARAM_DST		C dst end, reloaded in the loop (edi is scratch)
   103  	movl	%ebp, VAR_INVERSE	C frees ebp to hold the carry bit
   104  
   105  L(even):
   106  	C eax	src[0]
   107  	C ebx	counter, limbs, negative
   108  	C ecx	shift
   109  	C edx	(zeroed below)
   110  	C esi	&src[size]
   111  	C edi	&dst[size]
   112  	C ebp	inverse, also stored in VAR_INVERSE (zeroed below)
   113  
   114  	xorl	%ebp, %ebp		C initial carry bit
   115  	xorl	%edx, %edx		C initial carry limb (for size==1)
   116  
   117  	incl	%ebx
   118  	jz	L(even_one)		C size==1: only the final limb step is needed
   119  
   120  	movl	(%esi,%ebx,4), %edi	C src[1]
   121  
   122  	shrdl(	%cl, %edi, %eax)	C eax = src[0] >> shift, low bits of src[1] shifted in
   123  
   124  	jmp	L(even_entry)
   125  
   126  
   127  L(even_top):
   128  	C eax	scratch
   129  	C ebx	counter, limbs, negative
   130  	C ecx	shift
   131  	C edx	scratch
   132  	C esi	&src[size]
   133  	C edi	&dst[size] and scratch
   134  	C ebp	carry bit
   135  
   136  	movl	(%esi,%ebx,4), %edi	C next higher src limb, for the shrdl below
   137  
   138  	mull	PARAM_DIVISOR		C edx = high limb of q * divisor
   139  
   140  	movl	-4(%esi,%ebx,4), %eax	C current src limb
   141  	shrdl(	%cl, %edi, %eax)
   142  
   143  	subl	%ebp, %eax		C subtract carry bit
   144  
   145  	sbbl	%ebp, %ebp		C ebp = -borrow
   146  	subl	%edx, %eax		C subtract carry limb
   147  
   148  	sbbl	$0, %ebp		C ebp = -(borrows from both subtractions)
   149  
   150  L(even_entry):
   151  	imull	VAR_INVERSE, %eax	C q = limb * inverse mod 2^32
   152  
   153  	movl	PARAM_DST, %edi		C dst end (edi was used as scratch)
   154  	negl	%ebp			C carry bit, made non-negative
   155  
   156  	movl	%eax, -4(%edi,%ebx,4)	C store q
   157  	incl	%ebx
   158  	jnz	L(even_top)
   159  
   160  	mull	PARAM_DIVISOR		C edx = high limb of last q * divisor
   161  
   162  	movl	-4(%esi), %eax		C src[size-1], the top limb
   163  
   164  L(even_one):
   165  	shrl	%cl, %eax		C no higher limb to shift in
   166  	movl	SAVE_ESI, %esi
   167  
   168  	subl	%ebp, %eax
   169  	movl	SAVE_EBP, %ebp
   170  
   171  	subl	%edx, %eax
   172  	movl	SAVE_EBX, %ebx
   173  
   174  	imull	VAR_INVERSE, %eax
   175  
   176  	movl	%eax, -4(%edi)		C dst[size-1]
   177  	movl	SAVE_EDI, %edi
   178  	addl	$STACK_SPACE, %esp
   179  
   180  	ret
   181  
   182  C The dependent chain in the odd loop below is
   183  C
   184  C	subl	%edx, %eax       1
   185  C	imull	%ebp, %eax       4
   186  C	mull	PARAM_DIVISOR    5
   187  C			       ----
   188  C	total			10
   189  C
   190  C and this is the measured speed.  No special scheduling is necessary, out
   191  C of order execution hides the load latency.
   192  
   193  L(odd_top):
   194  	C eax	scratch (src limb)
   195  	C ebx	counter, limbs, negative
   196  	C ecx	carry bit
   197  	C edx	carry limb, high of last product
   198  	C esi	&src[size]
   199  	C edi	&dst[size]
   200  	C ebp	inverse
   201  
   202  	mull	PARAM_DIVISOR		C edx = high limb of q * divisor
   203  
   204  	movl	(%esi,%ebx,4), %eax	C next src limb
   205  	subl	%ecx, %eax		C subtract carry bit
   206  
   207  	sbbl	%ecx, %ecx		C ecx = -borrow
   208  	subl	%edx, %eax		C subtract carry limb
   209  
   210  	sbbl	$0, %ecx		C ecx = -(borrows from both subtractions)
   211  
   212  L(odd_entry):
   213  	imull	%ebp, %eax		C q = limb * inverse mod 2^32
   214  
   215  	movl	%eax, (%edi,%ebx,4)	C store q
   216  	negl	%ecx			C carry bit, made non-negative
   217  
   218  	incl	%ebx
   219  	jnz	L(odd_top)
   220  
   221  
   222  	movl	SAVE_ESI, %esi
   223  
   224  	movl	SAVE_EDI, %edi
   225  
   226  	movl	SAVE_EBP, %ebp
   227  
   228  	movl	SAVE_EBX, %ebx
   229  	addl	$STACK_SPACE, %esp
   230  
   231  	ret
   232  
   233  EPILOGUE()
   234  
   235  C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
   236  C                           mp_limb_t divisor);
   237  C
        C Entry point without a precomputed inverse.  It counts the low zero bits
        C of divisor (bsfl), shifts them out, stores the resulting value back into
        C PARAM_DIVISOR, computes an inverse of that value and jumps to L(common)
        C in mpn_pi1_bdiv_q_1 with ecx = bit count and ebp = inverse.
        C
        C The inverse starts from a binvert_limb_table byte indexed by bits 1..7
        C of the shifted divisor, followed by two steps of
        C inv = 2*inv - inv*inv*d.  Each such step doubles the number of correct
        C low bits, so the 8-bit table value becomes a 32-bit inverse.
        C
        C divisor must be non-zero: bsfl leaves its destination undefined for a
        C zero source.
   238  
   239  	ALIGN(16)
   240  PROLOGUE(mpn_bdiv_q_1)
   241  deflit(`FRAME',0)
   242  
   243  	movl	PARAM_DIVISOR, %eax
   244  	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
   245  
   246  	movl	%esi, SAVE_ESI
   247  	movl	PARAM_SRC, %esi
   248  
   249  	movl	%ebx, SAVE_EBX
   250  	movl	PARAM_SIZE, %ebx
   251  
   252  	bsfl	%eax, %ecx		C trailing twos
   253  
   254  	movl	%ebp, SAVE_EBP
   255  
   256  	shrl	%cl, %eax		C d without twos
   257  
   258  	movl	%eax, %edx
   259  	shrl	%eax			C d/2 without twos
   260  
   261  	movl	%edx, PARAM_DIVISOR	C the loops multiply by this shifted value
   262  	andl	$127, %eax		C table index: bits 1..7 of d without twos
   263  
   264  ifdef(`PIC',`
   265  	LEA(	binvert_limb_table, %ebp)
   266  	movzbl	(%eax,%ebp), %ebp		C inv 8 bits
   267  ',`
   268  	movzbl	binvert_limb_table(%eax), %ebp	C inv 8 bits
   269  ')
   270  
   271  	leal	(%ebp,%ebp), %eax	C 2*inv
   272  
   273  	imull	%ebp, %ebp		C inv*inv
   274  	imull	%edx, %ebp	C inv*inv*d
   275  
   276  	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
   277  	leal	(%eax,%eax), %ebp	C 2*inv
   278  
   279  	imull	%eax, %eax		C inv*inv
   280  	imull	%edx, %eax	C inv*inv*d
   281  
   282  	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d
   283  
   284  	jmp	L(common)		C continue in mpn_pi1_bdiv_q_1, which saves edi
   285  
   286  EPILOGUE()
   287  ASM_END()