github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/mode1o.asm (about)

     1  dnl  Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
     2  
     3  dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C P4: 19.0 cycles/limb
    35  
    36  
    37  C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
    38  C                               mp_limb_t divisor);
    39  C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
    40  C                                mp_limb_t divisor, mp_limb_t carry);
    41  C
    42  
    43  defframe(PARAM_CARRY,  16)
    44  defframe(PARAM_DIVISOR,12)
    45  defframe(PARAM_SIZE,   8)
    46  defframe(PARAM_SRC,    4)
    47  
    48  	TEXT
    49  
    50  	ALIGN(16)
    51  PROLOGUE(mpn_modexact_1c_odd)
    52  deflit(`FRAME',0)
    53  
    54  	movd	PARAM_CARRY, %mm1
    55  	jmp	L(start_1c)
    56  
    57  EPILOGUE()
    58  
    59  
    60  	ALIGN(16)
    61  PROLOGUE(mpn_modexact_1_odd)
    62  deflit(`FRAME',0)
    63  
    64  	pxor	%mm1, %mm1		C carry limb
    65  L(start_1c):
    66  	movl	PARAM_DIVISOR, %eax
    67  
    68  	movd	PARAM_DIVISOR, %mm7
    69  
    70  	shrl	%eax
    71  
    72  	andl	$127, %eax		C d/2, 7 bits
    73  
    74  ifdef(`PIC',`
    75  	LEA(	binvert_limb_table, %edx)
    76  	movzbl	(%eax,%edx), %eax		C inv 8 bits
    77  ',`
    78  	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
    79  ')
    80  
    81  	C
    82  
    83  	movd	%eax, %mm6		C inv
    84  
    85  	movd	%eax, %mm0		C inv
    86  
    87  	pmuludq	%mm6, %mm6		C inv*inv
    88  
    89  	C
    90  
    91  	pmuludq	%mm7, %mm6		C inv*inv*d
    92  	paddd	%mm0, %mm0		C 2*inv
    93  
    94  	C
    95  
    96  	psubd	%mm6, %mm0		C inv = 2*inv - inv*inv*d
    97  	pxor	%mm6, %mm6
    98  
    99  	paddd	%mm0, %mm6
   100  	pmuludq	%mm0, %mm0		C inv*inv
   101  
   102  	C
   103  
   104  	pmuludq	%mm7, %mm0		C inv*inv*d
   105  	paddd	%mm6, %mm6		C 2*inv
   106  
   107  
   108  	movl	PARAM_SRC, %eax
   109  	movl	PARAM_SIZE, %ecx
   110  
   111  	C
   112  
   113  	psubd	%mm0, %mm6		C inv = 2*inv - inv*inv*d
   114  
   115  	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
   116  	pushl	%eax	FRAME_pushl()
   117  	movd	%mm6, %eax
   118  	imul	PARAM_DIVISOR, %eax
   119  	cmpl	$1, %eax
   120  	popl	%eax	FRAME_popl()')
   121  
   122  	pxor	%mm0, %mm0		C carry bit
   123  
   124  
   125  C The dependent chain here is as follows.
   126  C
   127  C					latency
   128  C	psubq	 s = (src-cbit) - climb	   2
   129  C	pmuludq	 q = s*inverse		   8
   130  C	pmuludq	 prod = q*divisor	   8
   131  C	psrlq	 climb = high(prod)	   2
   132  C					  --
   133  C					  20
   134  C
   135  C Yet the loop measures 19.0 c/l, so obviously there's something gained
   136  C there over a straight reading of the chip documentation.
   137  
   138  L(top):
   139  	C eax	src, incrementing
   140  	C ebx
   141  	C ecx	counter, limbs
   142  	C edx
   143  	C
   144  	C mm0	carry bit
   145  	C mm1	carry limb
   146  	C mm6	inverse
   147  	C mm7	divisor
   148  
   149  	movd	(%eax), %mm2
   150  	addl	$4, %eax
   151  
   152  	psubq	%mm0, %mm2		C src - cbit
   153  
   154  	psubq	%mm1, %mm2		C src - cbit - climb
   155  	movq	%mm2, %mm0
   156  	psrlq	$63, %mm0		C new cbit
   157  
   158  	pmuludq	%mm6, %mm2		C s*inverse
   159  
   160  	movq	%mm7, %mm1
   161  	pmuludq	%mm2, %mm1		C q*divisor
   162  	psrlq	$32, %mm1		C new climb
   163  
   164  	subl	$1, %ecx
   165  	jnz	L(top)
   166  
   167  
   168  L(done):
   169  	paddq	%mm1, %mm0
   170  	movd	%mm0, %eax
   171  	emms
   172  	ret
   173  
   174  EPILOGUE()
   175  ASM_END()