github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/mode1o.asm (about)

     1  dnl  Intel Pentium mpn_modexact_1_odd -- exact division style remainder.
     2  
     3  dnl  Copyright 2000-2002, 2014 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C P5: 23.0 cycles/limb
    35  
    36  
    37  C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
    38  C                               mp_limb_t divisor);
    39  C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
    40  C                                mp_limb_t divisor, mp_limb_t carry);
    41  C
    42  C There seems no way to pair up the two lone instructions in the main loop.
    43  C
    44  C The special case for size==1 saves about 20 cycles (non-PIC), making it
    45  C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at
    46  C all sizes.
    47  C
    48  C Alternatives:
    49  C
    50  C Using mmx for the multiplies might be possible, with pmullw and pmulhw
    51  C having just 3 cycle latencies, but carry bit handling would probably be
    52  C complicated.
    53  
    54  defframe(PARAM_CARRY,  16)
    55  defframe(PARAM_DIVISOR,12)
    56  defframe(PARAM_SIZE,   8)
    57  defframe(PARAM_SRC,    4)
    58  
    59  dnl  re-using parameter space
    60  define(VAR_INVERSE,`PARAM_SIZE')
    61  
    62  	TEXT
    63  
    64  	ALIGN(16)
    65  PROLOGUE(mpn_modexact_1c_odd)
    66  deflit(`FRAME',0)
    67  
    68  	movl	PARAM_DIVISOR, %eax
    69  	movl	PARAM_CARRY, %edx
    70  
    71  	jmp	L(start_1c)
    72  
    73  EPILOGUE()
    74  
    75  	ALIGN(16)
    76  PROLOGUE(mpn_modexact_1_odd)
    77  deflit(`FRAME',0)
    78  
    79  	movl	PARAM_DIVISOR, %eax
    80  	xorl	%edx, %edx		C carry
    81  
    82  L(start_1c):
    83  
    84  ifdef(`PIC',`
    85  ifdef(`DARWIN',`
    86  	shrl	%eax			C d/2
    87  	LEA(	binvert_limb_table, %ecx)
    88  	pushl	%ebx		FRAME_pushl()
    89  	movl	PARAM_SIZE, %ebx
    90  
    91  	andl	$127, %eax
    92  	subl	$2, %ebx
    93  
    94  	movb	(%eax,%ecx), %cl
    95  	jc	L(one_limb)
    96  ',`
    97  	call	L(here)		FRAME_pushl()
    98  L(here):
    99  
   100  	shrl	%eax			C d/2
   101  	movl	(%esp), %ecx		C eip
   102  
   103  	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx
   104  	movl	%ebx, (%esp)		C push ebx
   105  
   106  	andl	$127, %eax
   107  	movl	PARAM_SIZE, %ebx
   108  
   109  	movl	binvert_limb_table@GOT(%ecx), %ecx
   110  	subl	$2, %ebx
   111  
   112  	movb	(%eax,%ecx), %cl			C inv 8 bits
   113  	jc	L(one_limb)
   114  ')
   115  ',`
   116  dnl non-PIC
   117  	shrl	%eax			C d/2
   118  	pushl	%ebx		FRAME_pushl()
   119  
   120  	movl	PARAM_SIZE, %ebx
   121  	andl	$127, %eax
   122  
   123  	subl	$2, %ebx
   124  	jc	L(one_limb)
   125  
   126  	movb	binvert_limb_table(%eax), %cl		C inv 8 bits
   127  ')
   128  
   129  	movl	%ecx, %eax
   130  	addl	%ecx, %ecx		C 2*inv
   131  
   132  	imull	%eax, %eax		C inv*inv
   133  
   134  	imull	PARAM_DIVISOR, %eax	C inv*inv*d
   135  
   136  	subl	%eax, %ecx		C inv = 2*inv - inv*inv*d
   137  
   138  	movl	%ecx, %eax
   139  	addl	%ecx, %ecx		C 2*inv
   140  
   141  	imull	%eax, %eax		C inv*inv
   142  
   143  	imull	PARAM_DIVISOR, %eax	C inv*inv*d
   144  
   145  	subl	%eax, %ecx		C inv = 2*inv - inv*inv*d
   146  	pushl	%esi		FRAME_pushl()
   147  
   148  	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
   149  	movl	%ecx, %eax
   150  	imull	PARAM_DIVISOR, %eax
   151  	cmpl	$1, %eax')
   152  
   153  	movl	PARAM_SRC, %esi
   154  	movl	%ecx, VAR_INVERSE
   155  
   156  	movl	(%esi), %eax		C src[0]
   157  	leal	4(%esi,%ebx,4), %esi	C &src[size-1]
   158  
   159  	xorl	$-1, %ebx		C -(size-1)
   160  	ASSERT(nz)
   161  	jmp	L(entry)
   162  
   163  
   164  C The use of VAR_INVERSE means only a store is needed for that value, rather
   165  C than a push and pop of say %edi.
   166  
   167  	ALIGN(16)
   168  L(top):
   169  	C eax	scratch, low product
   170  	C ebx	counter, limbs, negative
   171  	C ecx	carry bit
   172  	C edx	scratch, high product
   173  	C esi	&src[size-1]
   174  	C edi
   175  	C ebp
   176  
   177  	mull	PARAM_DIVISOR		C h:dummy = q*d
   178  
   179  	movl	(%esi,%ebx,4), %eax	C src[i]
   180  	subl	%ecx, %edx		C h -= -c
   181  
   182  L(entry):
   183  	subl	%edx, %eax		C s = src[i] - h
   184  
   185  	sbbl	%ecx, %ecx		C new -c (0 or -1)
   186  
   187  	imull	VAR_INVERSE, %eax	C q = s*i
   188  
   189  	incl	%ebx
   190  	jnz	L(top)
   191  
   192  
   193  	mull	PARAM_DIVISOR
   194  
   195  	movl	(%esi), %eax		C src high
   196  	subl	%ecx, %edx		C h -= -c
   197  
   198  	cmpl	PARAM_DIVISOR, %eax
   199  
   200  	jbe	L(skip_last)
   201  deflit(FRAME_LAST,FRAME)
   202  
   203  
   204  	subl	%edx, %eax		C s = src[i] - h
   205  	popl	%esi		FRAME_popl()
   206  
   207  	sbbl	%ecx, %ecx		C c (0 or -1)
   208  	popl	%ebx		FRAME_popl()
   209  
   210  	imull	VAR_INVERSE, %eax	C q = s*i
   211  
   212  	mull	PARAM_DIVISOR		C h:dummy = q*d
   213  
   214  	movl	%edx, %eax
   215  
   216  	subl	%ecx, %eax
   217  
   218  	ret
   219  
   220  
   221  C When high<divisor can skip last step.
   222  
   223  L(skip_last):
   224  deflit(`FRAME',FRAME_LAST)
   225  	C eax	src high
   226  	C ebx
   227  	C ecx
   228  	C edx	r
   229  	C esi
   230  
   231  	subl	%eax, %edx	C r-s
   232  	popl	%esi		FRAME_popl()
   233  
   234  	sbbl	%eax, %eax	C -1 if underflow
   235  	movl	PARAM_DIVISOR, %ebx
   236  
   237  	andl	%ebx, %eax	C divisor if underflow
   238  	popl	%ebx		FRAME_popl()
   239  
   240  	addl	%edx, %eax	C addback if underflow
   241  
   242  	ret
   243  
   244  
   245  C Special case for size==1 using a division for r = c-a mod d.
   246  C Could look for a-c<d and save a division sometimes, but that doesn't seem
   247  C worth bothering about.
   248  
   249  L(one_limb):
   250  deflit(`FRAME',4)
   251  	C eax
   252  	C ebx	size-2 (==-1)
   253  	C ecx
   254  	C edx	carry
   255  	C esi	src end
   256  	C edi
   257  	C ebp
   258  
   259  	movl	%edx, %eax
   260  	movl	PARAM_SRC, %edx
   261  
   262  	movl	PARAM_DIVISOR, %ecx
   263  	popl	%ebx		FRAME_popl()
   264  
   265  	subl	(%edx), %eax		C c-a
   266  
   267  	sbbl	%edx, %edx
   268  	decl	%ecx			C d-1
   269  
   270  	andl	%ecx, %edx		C b*d+c-a if c<a, or c-a if c>=a
   271  
   272  	divl	PARAM_DIVISOR
   273  
   274  	movl	%edx, %eax
   275  
   276  	ret
   277  
   278  EPILOGUE()
   279  ASM_END()