github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/rshift.asm (about)

     1  dnl  Intel Pentium mpn_rshift -- mpn right shift.
     2  
     3  dnl  Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C         cycles/limb
    35  C P5,P54:    6.0
    36  C P55:       5.375
    37  
    38  
    39  C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
    40  C                       unsigned shift);
    41  C
    42  C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
    43  C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
    44  
    45  defframe(PARAM_SHIFT,16)
    46  defframe(PARAM_SIZE, 12)
    47  defframe(PARAM_SRC,  8)
    48  defframe(PARAM_DST,  4)
    49  
        C mpn_rshift (dst, src, size, shift)
        C
        C Shifts the size limbs at src right by shift bits, stores the size
        C result limbs at dst, and returns in %eax the bits shifted out of the
        C least significant limb, positioned at the top of the return limb.
        C
        C The arguments are read from the stack through the PARAM_* frame
        C offsets above.  The first source limb is loaded unconditionally, so
        C size is assumed to be at least 1 (this is not checked here).
        C
        C Register use after the prologue:
        C   %edi  dst pointer          %esi  src pointer
        C   %ebp  limb counter         %ecx  shift count (used as %cl)
        C   %edx  limb carried over from one step to the next
        C   %eax, %ebx  scratch
        C
        C Two code paths:
        C   L(normal)   any shift count; walks upwards from the least
        C               significant limb using shrdl.
        C   L(special)  shift == 1 only; walks downwards from the most
        C               significant limb using rcrl through the carry flag.
        C
        C NOTE(review): shrdl() written with parentheses is a macro defined
        C outside this file; presumably it emits the shrdl instruction with
        C the operands given -- confirm against the m4 definitions.
    50  	TEXT
    51  	ALIGN(8)
    52  PROLOGUE(mpn_rshift)
    53  
    54  	pushl	%edi
    55  	pushl	%esi
    56  	pushl	%ebx
    57  	pushl	%ebp
    58  deflit(`FRAME',16)
    59  
    60  	movl	PARAM_DST,%edi
    61  	movl	PARAM_SRC,%esi
    62  	movl	PARAM_SIZE,%ebp
    63  	movl	PARAM_SHIFT,%ecx
    64  
    65  C We can use faster code for shift-by-1 under certain conditions.
        C The fast path runs from the top limb downwards, so it is taken only
        C when that order cannot overwrite a source limb before it is read.
    66  	cmp	$1,%ecx
    67  	jne	L(normal)
    68  	leal	4(%edi),%eax
    69  	cmpl	%esi,%eax
    70  	jnc	L(special)		C jump if res_ptr + 1 >= s_ptr
    71  	leal	(%edi,%ebp,4),%eax
    72  	cmpl	%eax,%esi
    73  	jnc	L(special)		C jump if s_ptr >= res_ptr + size
    74  
    75  L(normal):
    76  	movl	(%esi),%edx		C edx = least significant source limb
    77  	addl	$4,%esi
    78  	xorl	%eax,%eax
    79  	shrdl(	%cl, %edx, %eax)	C compute carry limb
    80  	pushl	%eax			C push carry limb onto stack
    81  
    82  	decl	%ebp			C size-1 limbs are stored by the loops
    83  	pushl	%ebp			C keep size-1 for the leftover count
    84  	shrl	$3,%ebp			C number of 8-limb groups
    85  	jz	L(end)
    86  
    87  	movl	(%edi),%eax		C fetch destination cache line
    88  
        C Main loop, 8 limbs per pass.  On entry %edx holds the source limb
        C whose result has not been stored yet.  The value loaded from
        C 28(%edi) is discarded; %eax is overwritten two instructions later.
    89  	ALIGN(4)
    90  L(oop):	movl	28(%edi),%eax		C fetch destination cache line
    91  	movl	%edx,%ebx		C ebx = limb pending from previous step
    92  
    93  	movl	(%esi),%eax
    94  	movl	4(%esi),%edx
    95  	shrdl(	%cl, %eax, %ebx)	C pending limb, filled from next limb
    96  	shrdl(	%cl, %edx, %eax)
    97  	movl	%ebx,(%edi)
    98  	movl	%eax,4(%edi)
    99  
   100  	movl	8(%esi),%ebx
   101  	movl	12(%esi),%eax
   102  	shrdl(	%cl, %ebx, %edx)
   103  	shrdl(	%cl, %eax, %ebx)
   104  	movl	%edx,8(%edi)
   105  	movl	%ebx,12(%edi)
   106  
   107  	movl	16(%esi),%edx
   108  	movl	20(%esi),%ebx
   109  	shrdl(	%cl, %edx, %eax)
   110  	shrdl(	%cl, %ebx, %edx)
   111  	movl	%eax,16(%edi)
   112  	movl	%edx,20(%edi)
   113  
   114  	movl	24(%esi),%eax
   115  	movl	28(%esi),%edx		C edx = new pending limb for next pass
   116  	shrdl(	%cl, %eax, %ebx)
   117  	shrdl(	%cl, %edx, %eax)
   118  	movl	%ebx,24(%edi)
   119  	movl	%eax,28(%edi)
   120  
   121  	addl	$32,%esi
   122  	addl	$32,%edi
   123  	decl	%ebp
   124  	jnz	L(oop)
   125  
   126  L(end):	popl	%ebp			C size-1
   127  	andl	$7,%ebp			C limbs left over after the 8-limb groups
   128  	jz	L(end2)
   129  L(oop2):
   130  	movl	(%esi),%eax
   131  	shrdl(	%cl,%eax,%edx)		C compute result limb
   132  	movl	%edx,(%edi)
   133  	movl	%eax,%edx		C the limb just read becomes the pending one
   134  	addl	$4,%esi
   135  	addl	$4,%edi
   136  	decl	%ebp
   137  	jnz	L(oop2)
   138  
   139  L(end2):
   140  	shrl	%cl,%edx		C compute most significant limb
   141  	movl	%edx,(%edi)		C store it
   142  
   143  	popl	%eax			C pop carry limb
   144  
   145  	popl	%ebp			C restore registers, reverse of push order
   146  	popl	%ebx
   147  	popl	%esi
   148  	popl	%edi
   149  	ret
   150  
   151  
   152  C We loop from the most significant end of the arrays, which is only
   153  C permissible when dst + 1 >= src or the source lies entirely above the
   154  C destination (the two tests before L(normal)), since the function is
        C documented to work for overlapping source and destination.
        C
        C The bit shifted out of each limb travels to the next lower limb in
        C the carry flag, so between the first shrl and the final rcrl only
        C instructions that leave the carry alone (movl, leal, incl, decl,
        C popl) are used, except where it is explicitly saved and restored.
   155  
   156  L(special):
   157  	leal	-4(%edi,%ebp,4),%edi	C edi = address of the last dst limb
   158  	leal	-4(%esi,%ebp,4),%esi	C esi = address of the last src limb
   159  
   160  	movl	(%esi),%edx		C edx = most significant source limb
   161  	subl	$4,%esi
   162  
   163  	decl	%ebp
   164  	pushl	%ebp			C keep size-1 for the leftover count
   165  	shrl	$3,%ebp			C number of 8-limb groups
   166  
   167  	shrl	%edx			C top limb >> 1, carry = bit shifted out
   168  	incl	%ebp			C incl/decl leave the carry alone and set
   169  	decl	%ebp			C the zero flag from the group count
   170  	jz	L(Lend)
   171  
   172  	movl	(%edi),%eax		C fetch destination cache line
   173  
   174  	ALIGN(4)
   175  L(Loop):
   176  	movl	-28(%edi),%eax		C fetch destination cache line
   177  	movl	%edx,%ebx		C ebx = finished limb awaiting its store
   178  
   179  	movl	(%esi),%eax
   180  	movl	-4(%esi),%edx
   181  	rcrl	%eax			C carry in at the top, low bit out to carry
   182  	movl	%ebx,(%edi)
   183  	rcrl	%edx
   184  	movl	%eax,-4(%edi)
   185  
   186  	movl	-8(%esi),%ebx
   187  	movl	-12(%esi),%eax
   188  	rcrl	%ebx
   189  	movl	%edx,-8(%edi)
   190  	rcrl	%eax
   191  	movl	%ebx,-12(%edi)
   192  
   193  	movl	-16(%esi),%edx
   194  	movl	-20(%esi),%ebx
   195  	rcrl	%edx
   196  	movl	%eax,-16(%edi)
   197  	rcrl	%ebx
   198  	movl	%edx,-20(%edi)
   199  
   200  	movl	-24(%esi),%eax
   201  	movl	-28(%esi),%edx
   202  	rcrl	%eax
   203  	movl	%ebx,-24(%edi)
   204  	rcrl	%edx			C edx = finished limb stored on next pass
   205  	movl	%eax,-28(%edi)
   206  
   207  	leal	-32(%esi),%esi		C use leal not to clobber carry
   208  	leal	-32(%edi),%edi
   209  	decl	%ebp
   210  	jnz	L(Loop)
   211  
   212  L(Lend):
   213  	popl	%ebp			C size-1
   214  	sbbl	%eax,%eax		C save carry in %eax
   215  	andl	$7,%ebp			C leftover limbs; this clobbers the carry
   216  	jz	L(Lend2)
   217  	addl	%eax,%eax		C restore carry from eax
   218  L(Loop2):
   219  	movl	%edx,%ebx		C ebx = finished limb awaiting its store
   220  	movl	(%esi),%edx
   221  	rcrl	%edx
   222  	movl	%ebx,(%edi)
   223  
   224  	leal	-4(%esi),%esi		C use leal not to clobber carry
   225  	leal	-4(%edi),%edi
   226  	decl	%ebp
   227  	jnz	L(Loop2)
   228  
   229  	jmp	L(L1)			C carry is still live, skip the restore
   230  L(Lend2):
   231  	addl	%eax,%eax		C restore carry from eax
   232  L(L1):	movl	%edx,(%edi)		C store last limb
   233  
   234  	movl	$0,%eax			C movl rather than xorl, to keep the carry
   235  	rcrl	%eax			C return the shifted-out bit in bit 31
   236  
   237  	popl	%ebp			C restore registers, reverse of push order
   238  	popl	%ebx
   239  	popl	%esi
   240  	popl	%edi
   241  	ret
   242  
   243  EPILOGUE()