github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/mmx/lshift.asm (about)

     1  dnl  AMD K6 mpn_lshift -- mpn left shift.
     2  
     3  dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C K6: 3.0 cycles/limb
    35  
    36  
    37  C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
    38  C                       unsigned shift);
    39  C
    40  C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
    41  C instructions.  This is despite every second fetch being unaligned.
    42  
    43  
    44  defframe(PARAM_SHIFT,16)	C 4th argument, shift count
    45  defframe(PARAM_SIZE, 12)	C 3rd argument, size in limbs
    46  defframe(PARAM_SRC,  8)	C 2nd argument, source pointer
    47  defframe(PARAM_DST,  4)	C 1st argument, destination pointer
        C NOTE(review): the numbers are presumably byte offsets from the stack
        C pointer at function entry, with the return address at offset 0;
        C confirm against the defframe macro definition.
    48  
    49  	TEXT		C presumably selects the code section
    50  	ALIGN(32)	C presumably aligns the entry point that follows
    51  
    52  PROLOGUE(mpn_lshift)
    53  deflit(`FRAME',0)
        	C Shifts the size-limb number at src left by shift bits, stores the
        	C result at dst and returns in eax the bits shifted out of the high
        	C limb.  Limbs are processed from the high end down to the low end.
        	C
        	C NOTE(review): the code appears to assume size >= 1 and
        	C 1 <= shift <= 31, since 32-shift is used as a shift count and
        	C size-1 as a loop counter; confirm against the mpn_lshift contract.
    54  
    55  	C The 1 limb case can be done without the push %ebx, but it's then
    56  	C still the same speed.  The push is left as a free helping hand for
    57  	C the two_or_more code.
    58  
    59  	movl	PARAM_SIZE, %eax	C eax = size
    60  	pushl	%ebx			FRAME_pushl()	C save ebx; FRAME_pushl presumably keeps PARAM_* offsets right
    61  
    62  	movl	PARAM_SRC, %ebx		C ebx = src
    63  	decl	%eax			C eax = size-1, ZF set when size is 1
    64  
    65  	movl	PARAM_SHIFT, %ecx	C ecx = shift, movl leaves the flags alone
    66  	jnz	L(two_or_more)		C branches on the decl result above
    67  
    68  	movl	(%ebx), %edx		C src limb
    69  	movl	PARAM_DST, %ebx		C ebx = dst, src pointer no longer needed
    70  
    71  	shldl(	%cl, %edx, %eax)	C return value, eax is 0 here so it gets only the bits leaving edx
    72  
    73  	shll	%cl, %edx		C edx = src limb << shift
    74  
    75  	movl	%edx, (%ebx)		C dst limb
    76  	popl	%ebx			C restore the ebx saved at entry
    77  
    78  	ret
    79  
    80  
    81  	ALIGN(16)	C avoid offset 0x1f
    82  	nop		C avoid bad cache line crossing
    83  L(two_or_more):
    84  	C eax	size-1
    85  	C ebx	src
    86  	C ecx	shift
    87  	C edx
    88  
    89  	movl	(%ebx,%eax,4), %edx	C src high limb
    90  	negl	%ecx			C ecx = -shift
    91  
    92  	movd	PARAM_SHIFT, %mm6	C mm6 = shift, used by the final psllq
    93  	addl	$32, %ecx		C 32-shift
    94  
    95  	shrl	%cl, %edx		C edx = bits shifted out of the high limb, the return value
    96  
    97  	movd	%ecx, %mm7		C mm7 = 32-shift, used by psrlq in the loop
    98  	movl	PARAM_DST, %ecx		C ecx = dst
    99  
   100  L(top):
   101  	C eax	counter, size-1 to 1
   102  	C ebx	src
   103  	C ecx	dst
   104  	C edx	retval
   105  	C
   106  	C mm0	scratch
   107  	C mm6	shift
   108  	C mm7	32-shift
        	C
        	C Each pass loads the two adjacent src limbs eax-1 and eax as one
        	C 64-bit value, shifts it right by 32-shift, and stores the low
        	C 32 bits as dst limb eax.
   109  
   110  	movq	-4(%ebx,%eax,4), %mm0	C low half = src limb eax-1, high half = src limb eax
   111  	decl	%eax			C sets the flags tested by jnz below
   112  
   113  	psrlq	%mm7, %mm0		C low 32 bits now hold the shifted result limb
   114  
   115  	movd	%mm0, 4(%ecx,%eax,4)	C store at dst limb eax+1, the index before the decl
   116  	jnz	L(top)			C flags still from decl, psrlq and movd do not write them
   117  
   118  
   119  	movd	(%ebx), %mm0		C low src limb, upper half of mm0 zeroed
   120  	popl	%ebx			C restore the ebx saved at entry
   121  
   122  	psllq	%mm6, %mm0		C low 32 bits = low src limb << shift
   123  	movl	%edx, %eax		C return value
   124  
   125  	movd	%mm0, (%ecx)		C store low dst limb
   126  
   127  	emms				C clear the mmx state before returning
   128  	ret
   129  
   130  EPILOGUE()