github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/mmx/rshift.asm (about)

     1  dnl  AMD K6 mpn_rshift -- mpn right shift.
     2  
     3  dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C K6: 3.0 cycles/limb
    35  
    36  
    37  C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
    38  C                       unsigned shift);
    39  C
    40  C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
    41  C instructions.  This is despite every second fetch being unaligned.
    42  C
    43  C For shift in 1..31, returns the bits shifted out of src[0], left-justified.
    44  defframe(PARAM_SHIFT,16)
    45  defframe(PARAM_SIZE, 12)
    46  defframe(PARAM_SRC,  8)
    47  defframe(PARAM_DST,  4)
    48  deflit(`FRAME',0)
    49  
    50  	TEXT
    51  	ALIGN(32)
    52  
    53  PROLOGUE(mpn_rshift)
    54  deflit(`FRAME',0)
    55  
    56  	C The 1 limb case can be done without the push %ebx, but it's then
    57  	C still the same speed.  The push is left as a free helping hand for
    58  	C the two_or_more code.
    59  	C NOTE(review): size==0 is not tested for; assumes size >= 1 -- confirm.
    60  	movl	PARAM_SIZE, %eax
    61  	pushl	%ebx			FRAME_pushl()
    62  
    63  	movl	PARAM_SRC, %ebx
    64  	decl	%eax			C size-1, ZF from here feeds the jnz below
    65  
    66  	movl	PARAM_SHIFT, %ecx	C movl does not change the flags
    67  	jnz	L(two_or_more)		C taken when size-1 != 0
    68  
    69  	movl	(%ebx), %edx		C src limb
    70  	movl	PARAM_DST, %ebx		C src pointer is dead, reuse ebx for dst
    71  
    72  	shrdl(	%cl, %edx, %eax)	C return value, eax is 0 before the shift
    73  
    74  	shrl	%cl, %edx
    75  
    76  	movl	%edx, (%ebx)		C dst limb
    77  	popl	%ebx
    78  
    79  	ret				C no mmx used on this path, so no emms
    80  
    81  
    82  	ALIGN(16)	C avoid offset 0x1f
    83  L(two_or_more):
    84  	C eax	size-1
    85  	C ebx	src
    86  	C ecx	shift
    87  	C edx
    88  
    89  	movl	(%ebx), %edx	C src low limb
    90  	negl	%ecx
    91  
    92  	addl	$32, %ecx	C 32-shift
    93  	movd	PARAM_SHIFT, %mm6	C shift count for the psrlq in the loop
    94  
    95  	shll	%cl, %edx	C retval = src[0] << (32-shift)
    96  	movl	PARAM_DST, %ecx
    97  
    98  	leal	(%ebx,%eax,4), %ebx	C &src[size-1]
    99  
   100  	leal	-4(%ecx,%eax,4), %ecx	C &dst[size-2]
   101  	negl	%eax			C -(size-1)
   102  
   103  
   104  L(simple):
   105  	C eax	counter (negative)
   106  	C ebx	&src[size-1]
   107  	C ecx	&dst[size-2]
   108  	C edx	retval
   109  	C
   110  	C mm0	scratch
   111  	C mm6	shift
   112  	C Each pass: 64-bit load of src[i],src[i+1], shift, low 32 bits to dst[i].
   113  Zdisp(	movq,	0,(%ebx,%eax,4), %mm0)
   114  	incl	%eax			C ZF from here feeds the jnz below
   115  
   116  	psrlq	%mm6, %mm0
   117  
   118  Zdisp(	movd,	%mm0, 0,(%ecx,%eax,4))
   119  	jnz	L(simple)
   120  
   121  
   122  	movq	%mm0, (%ecx)		C dst[size-2] (again) and dst[size-1]
   123  	movl	%edx, %eax		C return value
   124  
   125  	popl	%ebx
   126  
   127  	emms
   128  	ret
   129  
   130  EPILOGUE()