github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/p6/mode1o.asm (about)

     1  dnl  Intel P6 mpn_modexact_1_odd -- exact division style remainder.
     2  
     3  dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C P6: 10.0 cycles/limb
    35  
    36  
    37  C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
    38  C                               mp_limb_t divisor);
    39  C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
    40  C                                mp_limb_t divisor, mp_limb_t carry);
    41  C
    42  C It's not worth skipping a step at the end when high<divisor since the main
    43  C loop is only 10 cycles.
    44  
    45  defframe(PARAM_CARRY,  16)	C carry (mpn_modexact_1c_odd prototype only)
    46  defframe(PARAM_DIVISOR,12)	C divisor
    47  defframe(PARAM_SIZE,   8)	C size
    48  defframe(PARAM_SRC,    4)	C src
    49  
    50  dnl  Not enough room under modexact_1 to make these re-use the parameter
    51  dnl  space, unfortunately.
    52  defframe(SAVE_EBX,     -4)	C slot used to preserve ebx
    53  defframe(SAVE_ESI,     -8)	C slot used to preserve esi
    54  defframe(SAVE_EDI,    -12)	C slot used to preserve edi
    55  deflit(STACK_SPACE, 12)	C bytes taken from esp to hold the three SAVE_ slots
    56  
    57  	TEXT
    58  
    59  	ALIGN(16)
    60  PROLOGUE(mpn_modexact_1c_odd)
    61  deflit(`FRAME',0)
    62  	C Entry point with an explicit carry-in: load it and join the common code.
    63  	movl	PARAM_CARRY, %ecx	C ecx = carry parameter
    64  	jmp	L(start_1c)		C label is inside mpn_modexact_1_odd, just after its ecx clear
    65  
    66  EPILOGUE()
    67  
    68  	ALIGN(16)
    69  PROLOGUE(mpn_modexact_1_odd)
    70  deflit(`FRAME',0)
    71  	C Entry point without a carry parameter; mpn_modexact_1c_odd joins at L(start_1c).
    72  	xorl	%ecx, %ecx		C carry-in = 0
    73  L(start_1c):
    74  	movl	PARAM_DIVISOR, %eax	C eax = d
    75  	C Make room for the three SAVE_ slots.
    76  	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
    77  
    78  	movl	%esi, SAVE_ESI		C preserve esi
    79  	movl	PARAM_SRC, %esi		C esi = src
    80  
    81  	shrl	%eax			C d/2
    82  	movl	%edi, SAVE_EDI		C preserve edi
    83  
    84  	andl	$127, %eax		C table index = low 7 bits of d/2
    85  
    86  ifdef(`PIC',`
    87  	LEA(	binvert_limb_table, %edi)
    88  	movzbl	(%eax,%edi), %edi		C inv 8 bits
    89  ',`
    90  	movzbl	binvert_limb_table(%eax), %edi	C inv 8 bits
    91  ')
    92  	C Two rounds of inv = 2*inv - inv*inv*d follow, interleaved with the loop setup.
    93  	xorl	%edx, %edx		C initial extra carry
    94  	leal	(%edi,%edi), %eax	C 2*inv
    95  
    96  	imull	%edi, %edi		C inv*inv
    97  
    98  	movl	%ebx, SAVE_EBX		C preserve ebx
    99  	movl	PARAM_SIZE, %ebx	C ebx = size
   100  
   101  	imull	PARAM_DIVISOR, %edi	C inv*inv*d
   102  
   103  	subl	%edi, %eax		C inv = 2*inv - inv*inv*d
   104  	leal	(%eax,%eax), %edi	C 2*inv
   105  
   106  	imull	%eax, %eax		C inv*inv
   107  
   108  	imull	PARAM_DIVISOR, %eax	C inv*inv*d
   109  
   110  	leal	(%esi,%ebx,4), %esi	C src end
   111  	negl	%ebx			C -size
   112  
   113  	subl	%eax, %edi		C inv = 2*inv - inv*inv*d
   114  	C edi now holds the final inverse; the ASSERT checks it against the divisor.
   115  	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
   116  	movl	PARAM_DIVISOR, %eax
   117  	imull	%edi, %eax
   118  	cmpl	$1, %eax')
   119  	C Main loop: one source limb per pass, walking ebx from -size up to zero.
   120  	C NOTE(review): the body runs before the count test, so size is presumably >= 1 -- confirm with callers.
   121  C The dependent chain here is
   122  C
   123  C	subl	%edx, %eax       1
   124  C	imull	%edi, %eax       4
   125  C	mull	PARAM_DIVISOR    5
   126  C			       ----
   127  C	total			10
   128  C
   129  C and this is the measured speed.  No special scheduling is necessary, out
   130  C of order execution hides the load latency.
   131  
   132  L(top):
   133  	C eax	scratch (src limb)
   134  	C ebx	counter, limbs, negative
   135  	C ecx	carry bit, 0 or 1 (on the first pass: 0, or PARAM_CARRY from the 1c entry)
   136  	C edx	carry limb, high of last product
   137  	C esi	&src[size]
   138  	C edi	inverse
   139  	C ebp	(not referenced by this routine)
   140  
   141  	movl	(%esi,%ebx,4), %eax	C eax = next source limb
   142  	subl	%ecx, %eax		C subtract the carry bit
   143  
   144  	sbbl	%ecx, %ecx		C ecx = 0 or -1, the borrow from that subtract
   145  	subl	%edx, %eax		C subtract the carry limb
   146  
   147  	sbbl	$0, %ecx		C fold in the borrow from the second subtract
   148  
   149  	imull	%edi, %eax		C eax = low limb of (limb - carries) * inverse
   150  
   151  	negl	%ecx			C turn the negative borrow count into the next carry bit
   152  
   153  	mull	PARAM_DIVISOR		C edx:eax = eax * d, edx becomes the next carry limb
   154  
   155  	incl	%ebx			C advance the negative counter
   156  	jnz	L(top)			C loop until the counter reaches zero
   157  
   158  	C Return value is carry bit + carry limb; restore registers and release the save area.
   159  	movl	SAVE_ESI, %esi
   160  	leal	(%ecx,%edx), %eax	C eax = ecx + edx
   161  
   162  	movl	SAVE_EDI, %edi
   163  
   164  	movl	SAVE_EBX, %ebx
   165  	addl	$STACK_SPACE, %esp
   166  
   167  	ret
   168  
   169  EPILOGUE()
   170  ASM_END()