github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/k62mmx/lshift.asm (about)

     1  dnl  AMD K6-2 mpn_lshift -- mpn left shift.
     2  
     3  dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C K6-2: 1.75 cycles/limb
    35  
    36  
    37  C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
    38  C                       unsigned shift);
    39  C
    40  
    41  defframe(PARAM_SHIFT,16)
    42  defframe(PARAM_SIZE, 12)
    43  defframe(PARAM_SRC,  8)
    44  defframe(PARAM_DST,  4)
        dnl  The four offsets above follow the prototype order (dst, src, size,
        dnl  shift) in 4-byte steps.  defframe is not defined in this file;
        dnl  presumably each PARAM_x becomes a stack operand adjusted by the
        dnl  current FRAME value -- confirm against the x86 m4 definitions.
    45  deflit(`FRAME',0)
    46  
    47  dnl  used after src has been fetched
        dnl  (the src argument slot is reused to hold the return value: it is
        dnl  written in L(two_or_more) and read back on the unrolled exit paths)
    48  define(VAR_RETVAL,`PARAM_SRC')
    49  
    50  dnl  minimum 9, because unrolled loop can't handle less
        dnl  (compared against size-1 in L(two_or_more); smaller sizes of two
        dnl  or more limbs use L(simple) instead)
    51  deflit(UNROLL_THRESHOLD, 9)
    52  
    53  	TEXT
    54  	ALIGN(32)
    55  
        C mpn_lshift: shift the size-limb number at src left by shift bits, store
        C the low size limbs at dst, and return in %eax the bits shifted out of
        C the top limb (they sit at the low end of the returned limb).
        C
        C Arguments are read from the stack through PARAM_DST, PARAM_SRC,
        C PARAM_SIZE and PARAM_SHIFT.  %ebx is pushed on entry and popped on
        C every exit path; %eax, %ecx, %edx and the MMX registers mm0, mm1, mm2,
        C mm6 and mm7 are used as scratch.
        C
        C Limbs are produced from the most significant end downwards.
        C
        C NOTE(review): the code appears to rely on size >= 1 and on
        C 1 <= shift <= 31 (size 0 would decrement to -1 and take the
        C two_or_more path; shift 0 would turn the shrl by 32-shift into a
        C count of 32) -- confirm against the documented mpn_lshift contract.
    56  PROLOGUE(mpn_lshift)
    57  deflit(`FRAME',0)
    58  
    59  	C The 1 limb case can be done without the push %ebx, but it's then
    60  	C still the same speed.  The push is left as a free helping hand for
    61  	C the two_or_more code.
    62  
    63  	movl	PARAM_SIZE, %eax
    64  	pushl	%ebx			FRAME_pushl()
    65  
    66  	movl	PARAM_SRC, %ebx
    67  	decl	%eax			C size-1, ZF set when size is 1
    68  
    69  	movl	PARAM_SHIFT, %ecx	C movl leaves the decl flags for the jnz
    70  	jnz	L(two_or_more)
    71  
        	C size is 1: %eax is zero here, so after the shldl it holds just
        	C the bits shifted out of the top of the src limb.
        
    72  	movl	(%ebx), %edx		C src limb
    73  	movl	PARAM_DST, %ebx
    74  
    75  	shldl(	%cl, %edx, %eax)	C return value
    76  
    77  	shll	%cl, %edx
    78  
    79  	movl	%edx, (%ebx)		C dst limb
    80  	popl	%ebx
    81  
    82  	ret				C no MMX used on this path, hence no femms
    83  
    84  
    85  C -----------------------------------------------------------------------------
    86  	ALIGN(16)	C avoid offset 0x1f
    87  L(two_or_more):
    88  	C eax	size-1
    89  	C ebx	src
    90  	C ecx	shift
    91  	C edx
        	C
        	C Compute the return value from the high src limb, put the shift
        	C count in mm6, then choose between the simple and unrolled loops.
    92  
    93  	movl	(%ebx,%eax,4), %edx	C src high limb
    94  	negl	%ecx
    95  
    96  	movd	PARAM_SHIFT, %mm6
    97  	addl	$32, %ecx		C 32-shift
    98  
    99  	shrl	%cl, %edx		C bits shifted out of the high limb
   100  	cmpl	$UNROLL_THRESHOLD-1, %eax
   101  
   102  	movl	%edx, VAR_RETVAL	C movl leaves the cmpl flags for the jae
   103  	jae	L(unroll)
   104  
   105  
   106  	movd	%ecx, %mm7
   107  	movl	%eax, %ecx
   108  
   109  	movl	PARAM_DST, %eax
   110  
   111  L(simple):
   112  	C eax	dst
   113  	C ebx	src
   114  	C ecx	counter, size-1 to 1
   115  	C edx	retval
   116  	C
   117  	C mm0	scratch
   118  	C mm6	shift
   119  	C mm7	32-shift
        	C
        	C Each pass loads src[ecx-1] and src[ecx] as one qword and shifts
        	C it right by 32-shift, which leaves the finished limb in the low
        	C dword; that dword is stored to dst[ecx].
        	C
        	C NOTE(review): Zdisp is not defined in this file; presumably it
        	C forces an explicit zero displacement in the encoding -- confirm.
   120  
   121  	movq	-4(%ebx,%ecx,4), %mm0
   122  
   123  	psrlq	%mm7, %mm0
   124  
   125  Zdisp(	movd,	%mm0, 0,(%eax,%ecx,4))
   126  	loop	L(simple)
   127  
   128  
        	C Lowest limb: only src[0] contributes, shifted left by shift.
        
   129  	movd	(%ebx), %mm0
   130  	popl	%ebx
   131  
   132  	psllq	%mm6, %mm0
   133  
   134  	movd	%mm0, (%eax)
   135  	movl	%edx, %eax		C retval is still live in edx on this path
   136  
   137  	femms				C leave MMX state (3DNow! form of emms)
   138  	ret
   139  
   140  
   141  C -----------------------------------------------------------------------------
   142  	ALIGN(16)
   143  L(unroll):
   144  	C eax	size-1
   145  	C ebx	src
   146  	C ecx	32-shift
   147  	C edx	retval (but instead VAR_RETVAL is used)
   148  	C
   149  	C mm6	shift
   150  
   151  	addl	$32, %ecx		C 64-shift, the qword right shift count
   152  	movl	PARAM_DST, %edx
   153  
   154  	movd	%ecx, %mm7
   155  	subl	$7, %eax			C size-8
   156  
        	C If bit 2 of dst+4*(size-8) is set, the top dst limb is stored on
        	C its own and %eax is stepped down by one, so that the qword stores
        	C which follow all go to addresses with bit 2 clear.
        
   157  	leal	(%edx,%eax,4), %ecx		C alignment of dst
   158  
   159  	movq	32-8(%ebx,%eax,4), %mm2		C src high qword
   160  	testb	$4, %cl
   161  
   162  	jz	L(dst_aligned)
   163  	psllq	%mm6, %mm2
   164  
   165  	psrlq	$32, %mm2		C finished top limb down to the low dword
   166  	decl	%eax
   167  
   168  	movd	%mm2, 32(%edx,%eax,4)		C dst high limb
   169  	movq	32-8(%ebx,%eax,4), %mm2		C new src high qword
   170  L(dst_aligned):
   171  
   172  	movq	32-16(%ebx,%eax,4), %mm0	C src second highest qword
   173  
   174  
   175  	C This loop is the important bit, the rest is just support for it.
   176  	C Four src limbs are held at the start, and four more will be read.
   177  	C Four dst limbs will be written.  This schedule seems necessary for
   178  	C full speed.
   179  	C
   180  	C The use of size-8 lets the loop stop when %eax goes negative and
   181  	C leaves -4 to -1 which can be tested with test $1 and $2.
   182  
   183  L(top):
   184  	C eax	counter, size-8 step by -4 until <0
   185  	C ebx	src
   186  	C ecx
   187  	C edx	dst
   188  	C
   189  	C mm0	src next qword
   190  	C mm1	scratch
   191  	C mm2	src prev qword
   192  	C mm6	shift
   193  	C mm7	64-shift
        	C
        	C Each output qword is (prev << shift) | (next >> (64-shift)).
        	C The loop branch uses the carry from the subl $4; none of the MMX
        	C instructions or movq loads/stores in between alter the flags.
   194  
   195  	psllq	%mm6, %mm2
   196  	subl	$4, %eax
   197  
   198  	movq	%mm0, %mm1
   199  	psrlq	%mm7, %mm0
   200  
   201  	por	%mm0, %mm2
   202  	movq	24(%ebx,%eax,4), %mm0
   203  
   204  	psllq	%mm6, %mm1
   205  	movq	%mm2, 40(%edx,%eax,4)
   206  
   207  	movq	%mm0, %mm2
   208  	psrlq	%mm7, %mm0
   209  
   210  	por	%mm0, %mm1
   211  	movq	16(%ebx,%eax,4), %mm0
   212  
   213  	movq	%mm1, 32(%edx,%eax,4)
   214  	jnc	L(top)			C loop until the subl $4 borrows
   215  
   216  
   217  	C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4.
   218  	C
   219  	C 8(%ebx) is the next source, and 24(%edx) is the next destination.
   220  	C %eax is between -4 and -1, representing respectively 0 to 3 extra
   221  	C limbs that must be read.
   222  
   223  
   224  	testl	$2, %eax	C testl to avoid bad cache line crossing
   225  	jz	L(finish_nottwo)
   226  
   227  	C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes
   228  	C new mm2 and a new mm0 is loaded.
   229  
   230  	psllq	%mm6, %mm2
   231  	movq	%mm0, %mm1
   232  
   233  	psrlq	%mm7, %mm0
   234  	subl	$2, %eax		C eax was -2 or -1, now -4 or -3
   235  
   236  	por	%mm0, %mm2
   237  	movq	16(%ebx,%eax,4), %mm0
   238  
   239  	movq	%mm2, 32(%edx,%eax,4)
   240  	movq	%mm1, %mm2
   241  L(finish_nottwo):
   242  
   243  
   244  	C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0
        	C
        	C %eax is -4 or -3 here.  The testb sets ZF for the jz at the end
        	C of this group; the MMX instructions in between leave it alone.
   245  
   246  	testb	$1, %al
   247  	psllq	%mm6, %mm2
   248  
   249  	movq	%mm0, %mm1
   250  	psrlq	%mm7, %mm0
   251  
   252  	por	%mm0, %mm2
   253  	psllq	%mm6, %mm1
   254  
   255  	movq	%mm2, 24(%edx,%eax,4)
   256  	jz	L(finish_even)
   257  
   258  
   259  	C Size is odd, so mm1 and one extra limb to process.
        	C (The parity tested is that of %eax, which is size-8 less one if
        	C the alignment limb was stored separately in L(unroll).)
        	C
        	C FRAME is set to 0 after the popl, presumably so that VAR_RETVAL
        	C below is addressed with nothing pushed -- confirm.
   260  
   261  	movd	(%ebx), %mm0		C src[0]
   262  	popl	%ebx
   263  deflit(`FRAME',0)
   264  
   265  	movq	%mm0, %mm2
   266  	psllq	$32, %mm0		C src[0] up into the high dword
   267  
   268  	psrlq	%mm7, %mm0		C low dword is now src[0] >> (32-shift)
   269  
   270  	psllq	%mm6, %mm2
   271  	por	%mm0, %mm1
   272  
   273  	movq	%mm1, 4(%edx)		C dst[1,2]
   274  	movd	%mm2, (%edx)		C dst[0]
   275  
   276  	movl	VAR_RETVAL, %eax
   277  
   278  	femms
   279  	ret
   280  
   281  
   282  	nop	C avoid bad cache line crossing
   283  L(finish_even):
   284  deflit(`FRAME',4)
   285  	C Size is even, so only mm1 left to process.
        	C (This label is reached by the jz above, before any popl, so
        	C %ebx is still on the stack; FRAME is set to 4 to match.)
   286  
   287  	movq	%mm1, (%edx)		C dst[0,1]
   288  	movl	VAR_RETVAL, %eax
   289  
   290  	popl	%ebx
   291  	femms
   292  	ret
   293  
   294  EPILOGUE()