github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/k62mmx/rshift.asm (about)

     1  dnl  AMD K6-2 mpn_rshift -- mpn right shift.
     2  
     3  dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C K6-2: 1.75 cycles/limb
    35  
    36  
    37  C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
    38  C                       unsigned shift);
    39  C
    40  
    41  defframe(PARAM_SHIFT,16)	C shift count, 4th argument
    42  defframe(PARAM_SIZE, 12)	C limb count, 3rd argument
    43  defframe(PARAM_SRC,  8)	C source limbs, 2nd argument
    44  defframe(PARAM_DST,  4)	C destination limbs, 1st argument
    45  deflit(`FRAME',0)
    46  
    47  dnl  Minimum 9, because the unrolled loop can't handle less.
    48  dnl  Sizes from 2 up to one below this use the L(simple) loop instead.
    49  deflit(UNROLL_THRESHOLD, 9)
    50  
    51  	TEXT
    52  	ALIGN(32)
    53  
    54  PROLOGUE(mpn_rshift)
    55  deflit(`FRAME',0)
    56  	C Right shift {src,size} into {dst,size}; eax returns the shifted-out bits.
    57  	C The 1 limb case can be done without the push %ebx, but it's then
    58  	C still the same speed.  The push is left as a free helping hand for
    59  	C the two_or_more code.
    60  
    61  	movl	PARAM_SIZE, %eax
    62  	pushl	%ebx			FRAME_pushl()
    63  
    64  	movl	PARAM_SRC, %ebx
    65  	decl	%eax			C eax = size-1, ZF set if size is 1
    66  
    67  	movl	PARAM_SHIFT, %ecx	C movl leaves the decl flags alone
    68  	jnz	L(two_or_more)
    69  
    70  	movl	(%ebx), %edx		C src limb
    71  	movl	PARAM_DST, %ebx		C src pointer is no longer needed
    72  
    73  	shrdl(	%cl, %edx, %eax)	C return value, eax was 0 so only bits leaving edx remain
    74  
    75  	shrl	%cl, %edx
    76  
    77  	movl	%edx, (%ebx)		C dst limb
    78  	popl	%ebx
    79  
    80  	ret				C no MMX register touched on this path, so no femms
    81  
    82  
    83  C -----------------------------------------------------------------------------
    84  	ALIGN(16)	C avoid offset 0x1f
    85  L(two_or_more):
    86  	C eax	size-1
    87  	C ebx	src
    88  	C ecx	shift
    89  	C edx
    90  
    91  	movl	(%ebx), %edx	C src low limb
    92  	negl	%ecx		C NOTE(review): assumes 1 <= shift <= 31, confirm with callers
    93  
    94  	addl	$32, %ecx	C ecx = 32-shift
    95  	movd	PARAM_SHIFT, %mm6	C mm6 = shift, the count for every psrlq below
    96  
    97  	shll	%cl, %edx	C retval = low limb << (32-shift)
    98  	cmpl	$UNROLL_THRESHOLD-1, %eax
    99  
   100  	jae	L(unroll)	C taken when size >= UNROLL_THRESHOLD
   101  
   102  
   103  	C eax	size-1
   104  	C ebx	src
   105  	C ecx	32-shift
   106  	C edx	retval
   107  	C
   108  	C mm6	shift
   109  
   110  	movl	PARAM_DST, %ecx
   111  	leal	(%ebx,%eax,4), %ebx	C ebx = &src[size-1]
   112  
   113  	leal	-4(%ecx,%eax,4), %ecx	C ecx = &dst[size-2]
   114  	negl	%eax			C eax = -(size-1)
   115  
   116  	C This loop runs at about 3 cycles/limb, which is the amount of
   117  	C decoding, and this is despite every second access being unaligned.
   118  
   119  L(simple):
   120  	C eax	counter, -(size-1) to -1
   121  	C ebx	&src[size-1]
   122  	C ecx	&dst[size-2]
   123  	C edx	retval
   124  	C
   125  	C mm0	scratch
   126  	C mm6	shift
   127  
   128  Zdisp(	movq,	0,(%ebx,%eax,4), %mm0)	C two adjacent src limbs
   129  	incl	%eax
   130  
   131  	psrlq	%mm6, %mm0		C low dword is now a finished dst limb
   132  
   133  Zdisp(	movd,	%mm0, 0,(%ecx,%eax,4))	C store that low dword
   134  	jnz	L(simple)		C flags are still those of incl
   135  
   136  
   137  	movq	%mm0, (%ecx)		C last qword supplies dst[size-2] and dst[size-1]
   138  	movl	%edx, %eax		C retval
   139  
   140  	popl	%ebx
   141  
   142  	femms				C 3DNow! fast emms, leave MMX state before returning
   143  	ret
   144  
   145  
   146  C -----------------------------------------------------------------------------
   147  	ALIGN(16)
   148  L(unroll):
   149  	C eax	size-1
   150  	C ebx	src
   151  	C ecx	32-shift
   152  	C edx	retval
   153  	C
   154  	C mm6	shift
   155  
   156  	addl	$32, %ecx		C ecx = 64-shift
   157  	subl	$7, %eax		C size-8
   158  
   159  	movd	%ecx, %mm7		C mm7 = 64-shift, the count for every psllq below
   160  	movl	PARAM_DST, %ecx
   161  
   162  	movq	(%ebx), %mm2		C src low qword
   163  	leal	(%ebx,%eax,4), %ebx	C src end - 32
   164  
   165  	testb	$4, %cl			C ZF set when bit 2 of the dst address is clear
   166  	leal	(%ecx,%eax,4), %ecx	C dst end - 32
   167  
   168  	notl	%eax			C -(size-7)
   169  	jz	L(dst_aligned)		C flags still from testb, leal and notl leave them alone
   170  
   171  	psrlq	%mm6, %mm2
   172  	incl	%eax			C one limb is stored here, eax = -(size-8)
   173  
   174  Zdisp(	movd,	%mm2, 0,(%ecx,%eax,4))	C dst low limb
   175  	movq	4(%ebx,%eax,4), %mm2	C new src low qword
   176  L(dst_aligned):
   177  
   178  	movq	12(%ebx,%eax,4), %mm0	C src second lowest qword
   179  	nop	C avoid bad cache line crossing
   180  
   181  
   182  	C This loop is the important bit, the rest is just support for it.
   183  	C Four src limbs are held at the start, and four more will be read.
   184  	C Four dst limbs will be written.  This schedule seems necessary for
   185  	C full speed.
   186  	C
   187  	C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and
   188  	C leaves 0 to 3 which can be tested with test $1 and $2.
   189  
   190  L(top):
   191  	C eax	counter, -(size-7) step by +4 until >=0
   192  	C ebx	src end - 32
   193  	C ecx	dst end - 32
   194  	C edx	retval
   195  	C
   196  	C mm0	src next qword
   197  	C mm1	scratch
   198  	C mm2	src prev qword
   199  	C mm6	shift
   200  	C mm7	64-shift
   201  
   202  	psrlq	%mm6, %mm2
   203  	addl	$4, %eax		C sets the flags consumed by ja at the loop end
   204  
   205  	movq	%mm0, %mm1
   206  	psllq	%mm7, %mm0
   207  
   208  	por	%mm0, %mm2
   209  	movq	4(%ebx,%eax,4), %mm0
   210  
   211  	psrlq	%mm6, %mm1
   212  	movq	%mm2, -12(%ecx,%eax,4)	C store lower two dst limbs of this pass
   213  
   214  	movq	%mm0, %mm2
   215  	psllq	%mm7, %mm0
   216  
   217  	por	%mm0, %mm1
   218  	movq	12(%ebx,%eax,4), %mm0
   219  
   220  	movq	%mm1, -4(%ecx,%eax,4)	C store upper two dst limbs of this pass
   221  	ja	L(top)		C jump if no carry and not zero
   222  
   223  
   224  
   225  	C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
   226  	C to 3 representing respectively 3 to 0 further limbs.
   227  
   228  	testl	$2, %eax	C testl to avoid bad cache line crossings
   229  	jnz	L(finish_nottwo)	C eax is 2 or 3: one or zero further limbs
   230  
   231  	C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
   232  	C becomes new mm2 and a new mm0 is loaded.
   233  
   234  	psrlq	%mm6, %mm2
   235  	movq	%mm0, %mm1
   236  
   237  	psllq	%mm7, %mm0
   238  	addl	$2, %eax		C eax becomes 2 or 3, as on the direct path
   239  
   240  	por	%mm0, %mm2
   241  	movq	12(%ebx,%eax,4), %mm0
   242  
   243  	movq	%mm2, -4(%ecx,%eax,4)
   244  	movq	%mm1, %mm2
   245  L(finish_nottwo):
   246  
   247  
   248  	testb	$1, %al			C eax is 3 (odd) when no further limb remains
   249  	psrlq	%mm6, %mm2
   250  
   251  	movq	%mm0, %mm1
   252  	psllq	%mm7, %mm0
   253  
   254  	por	%mm0, %mm2
   255  	psrlq	%mm6, %mm1
   256  
   257  	movq	%mm2, 4(%ecx,%eax,4)	C dst[size-5,size-4] if eax is 2, dst[size-4,size-3] if 3
   258  	jnz	L(finish_even)		C flags still from testb above
   259  
   260  
   261  	C one further extra limb to process
   262  
   263  	movd	32-4(%ebx), %mm0	C src[size-1], most significant limb
   264  	popl	%ebx
   265  
   266  	movq	%mm0, %mm2
   267  	psllq	%mm7, %mm0
   268  
   269  	por	%mm0, %mm1
   270  	psrlq	%mm6, %mm2
   271  
   272  	movq	%mm1, 32-12(%ecx)	C dst[size-3,size-2]
   273  	movd	%mm2, 32-4(%ecx)	C dst[size-1]
   274  
   275  	movl	%edx, %eax		C retval
   276  
   277  	femms
   278  	ret
   279  
   280  
   281  	nop	C avoid bad cache line crossing
   282  L(finish_even):
   283  	C no further extra limbs
   284  
   285  	movq	%mm1, 32-8(%ecx)	C dst[size-2,size-1]
   286  	movl	%edx, %eax		C retval
   287  
   288  	popl	%ebx
   289  
   290  	femms
   291  	ret
   292  
   293  EPILOGUE()