github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/mmx/rshift.asm (about)

     1  dnl  Intel P5 mpn_rshift -- mpn right shift.
     2  
     3  dnl  Copyright 2000, 2002 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C P5: 1.75 cycles/limb.
    35  
    36  
    37  C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
    38  C                       unsigned shift);
    39  C
    40  C Shift src,size right by shift many bits and store the result in dst,size.
    41  C Zeros are shifted in at the left.  Return the bits shifted out at the
    42  C right.
    43  C
    44  C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
    45  C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
    46  C
    47  C Full speed depends on source and destination being aligned.  Unaligned mmx
    48  C loads and stores on P5 don't pair and have a 2 cycle penalty.  Some hairy
    49  C setups and finish-ups are done to ensure alignment for the loop.
    50  C
    51  C MMX shifts work out a bit faster even for the simple loop.
    52  
    53  defframe(PARAM_SHIFT,16)
    54  defframe(PARAM_SIZE, 12)
    55  defframe(PARAM_SRC,  8)
    56  defframe(PARAM_DST,  4)
    57  deflit(`FRAME',0)
    58  
    59  dnl  Sizes at or above this threshold take the unrolled code.  Minimum 5, because the unrolled loop can't handle less.
    60  deflit(UNROLL_THRESHOLD, 5)
    61  
    62  	TEXT
    63  	ALIGN(8)
    64  
    65  PROLOGUE(mpn_rshift)
    66  
    67  	pushl	%ebx
    68  	pushl	%edi
    69  deflit(`FRAME',8)
    70  
    71  	movl	PARAM_SIZE, %eax
    72  	movl	PARAM_DST, %edx
    73  
    74  	movl	PARAM_SRC, %ebx
    75  	movl	PARAM_SHIFT, %ecx
    76  
    77  	cmp	$UNROLL_THRESHOLD, %eax
    78  	jae	L(unroll)		C size >= UNROLL_THRESHOLD
    79  
    80  	decl	%eax
    81  	movl	(%ebx), %edi		C src low limb
    82  
    83  	jnz	L(simple)		C size-1 != 0, flags are still those from the decl
    84  
    85  	shrdl(	%cl, %edi, %eax)	C eax was decremented to zero
    86  
    87  	shrl	%cl, %edi
    88  
    89  	movl	%edi, (%edx)		C dst low limb
    90  	popl	%edi			C risk of data cache bank clash
    91  
    92  	popl	%ebx
    93  
    94  	ret
    95  
    96  
    97  C -----------------------------------------------------------------------------
    98  	ALIGN(8)
    99  L(simple):
   100  	C eax	size-1
   101  	C ebx	src
   102  	C ecx	shift
   103  	C edx	dst
   104  	C esi
   105  	C edi
   106  	C ebp
   107  deflit(`FRAME',8)
   108  
   109  	movd	(%ebx), %mm5		C src[0]
   110  	leal	(%ebx,%eax,4), %ebx	C &src[size-1]
   111  
   112  	movd	%ecx, %mm6		C rshift
   113  	leal	-4(%edx,%eax,4), %edx	C &dst[size-2]
   114  
   115  	psllq	$32, %mm5
   116  	negl	%eax
   117  
   118  
   119  C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
   120  C cycle waiting for the mm0 result to be ready.  For comparison a shrdl is 4
   121  C cycles and would be 8 in a simple loop.  Using mmx helps the return value
   122  C and last limb calculations too.
   123  
   124  L(simple_top):
   125  	C eax	counter, limbs, negative
   126  	C ebx	&src[size-1]
   127  	C ecx	shift (unchanged here, the return value is formed in mm5)
   128  	C edx	&dst[size-2]
   129  	C
   130  	C mm0	scratch
   131  	C mm5	return value
   132  	C mm6	shift
   133  
   134  	movq	(%ebx,%eax,4), %mm0
   135  	incl	%eax
   136  
   137  	psrlq	%mm6, %mm0
   138  
   139  	movd	%mm0, (%edx,%eax,4)
   140  	jnz	L(simple_top)		C flags from the incl, mmx instructions leave them alone
   141  
   142  
   143  	movd	(%ebx), %mm0		C src[size-1], zero extended to a qword
   144  	psrlq	%mm6, %mm5		C return value
   145  
   146  	psrlq	%mm6, %mm0
   147  	popl	%edi
   148  
   149  	movd	%mm5, %eax
   150  	popl	%ebx
   151  
   152  	movd	%mm0, 4(%edx)		C dst[size-1]
   153  
   154  	emms
   155  
   156  	ret
   157  
   158  
   159  C -----------------------------------------------------------------------------
   160  	ALIGN(8)
   161  L(unroll):
   162  	C eax	size
   163  	C ebx	src
   164  	C ecx	shift
   165  	C edx	dst
   166  	C esi
   167  	C edi
   168  	C ebp
   169  deflit(`FRAME',8)
   170  
   171  	movd	(%ebx), %mm5		C src[0]
   172  	movl	$4, %edi
   173  
   174  	movd	%ecx, %mm6		C rshift
   175  	testl	%edi, %ebx
   176  
   177  	psllq	$32, %mm5
   178  	jz	L(start_src_aligned)
   179  
   180  
   181  	C src isn't aligned, process low limb separately (marked xxx) and
   182  	C step src and dst by one limb, making src aligned.
   183  	C
   184  	C source                  ebx
   185  	C --+-------+-------+-------+
   186  	C           |          xxx  |
   187  	C --+-------+-------+-------+
   188  	C         4mod8   0mod8   4mod8
   189  	C
   190  	C         dest            edx
   191  	C         --+-------+-------+
   192  	C           |       |  xxx  |
   193  	C         --+-------+-------+
   194  
   195  	movq	(%ebx), %mm0		C unaligned load
   196  
   197  	psrlq	%mm6, %mm0
   198  	addl	$4, %ebx
   199  
   200  	decl	%eax
   201  
   202  	movd	%mm0, (%edx)
   203  	addl	$4, %edx
   204  L(start_src_aligned):
   205  
   206  
   207  	movq	(%ebx), %mm1
   208  	testl	%edi, %edx
   209  
   210  	psrlq	%mm6, %mm5		C retval
   211  	jz	L(start_dst_aligned)
   212  
   213  	C dst isn't aligned, add 4 to make it so, and pretend the shift is
   214  	C 32 bits extra.  Low limb of dst (marked xxx) handled here
   215  	C separately.
   216  	C
   217  	C          source          ebx
   218  	C          --+-------+-------+
   219  	C            |      mm1      |
   220  	C          --+-------+-------+
   221  	C                  4mod8   0mod8
   222  	C
   223  	C  dest                    edx
   224  	C  --+-------+-------+-------+
   225  	C                    |  xxx  |
   226  	C  --+-------+-------+-------+
   227  	C          4mod8   0mod8   4mod8
   228  
   229  	movq	%mm1, %mm0
   230  	addl	$32, %ecx		C new shift
   231  
   232  	psrlq	%mm6, %mm0
   233  
   234  	movd	%ecx, %mm6
   235  
   236  	movd	%mm0, (%edx)
   237  	addl	$4, %edx
   238  L(start_dst_aligned):
   239  
   240  
   241  	movq	8(%ebx), %mm3
   242  	negl	%ecx
   243  
   244  	movq	%mm3, %mm2		C mm2 src qword
   245  	addl	$64, %ecx
   246  
   247  	movd	%ecx, %mm7
   248  	psrlq	%mm6, %mm1
   249  
   250  	leal	-12(%ebx,%eax,4), %ebx
   251  	leal	-20(%edx,%eax,4), %edx
   252  
   253  	psllq	%mm7, %mm3
   254  	subl	$7, %eax		C size-7
   255  
   256  	por	%mm1, %mm3		C mm3 ready to store
   257  	negl	%eax			C -(size-7)
   258  
   259  	jns	L(finish)		C skip the loop when -(size-7) >= 0
   260  
   261  
   262  	C This loop is the important bit, the rest is just support.  Careful
   263  	C instruction scheduling achieves the claimed 1.75 c/l.  The
   264  	C relevant parts of the pairing rules are:
   265  	C
   266  	C - mmx loads and stores execute only in the U pipe
   267  	C - only one mmx shift in a pair
   268  	C - wait one cycle before storing an mmx register result
   269  	C - the usual address generation interlock
   270  	C
   271  	C Two qword calculations are slightly interleaved.  The instructions
   272  	C marked "C" belong to the second qword, and the "C prev" one is for
   273  	C the second qword from the previous iteration.
   274  
   275  	ALIGN(8)
   276  L(unroll_loop):
   277  	C eax	counter, limbs, negative
   278  	C ebx	src+4*size-12 as a byte address, src and size as adjusted by the setup above
   279  	C ecx
   280  	C edx	dst+4*size-20 as a byte address, dst and size as adjusted by the setup above
   281  	C esi
   282  	C edi
   283  	C
   284  	C mm0
   285  	C mm1
   286  	C mm2	src qword from -8(%ebx,%eax,4)
   287  	C mm3	dst qword ready to store to -8(%edx,%eax,4)
   288  	C
   289  	C mm5	return value
   290  	C mm6	rshift
   291  	C mm7	lshift
   292  
   293  	movq	(%ebx,%eax,4), %mm0
   294  	psrlq	%mm6, %mm2
   295  
   296  	movq	%mm0, %mm1
   297  	psllq	%mm7, %mm0
   298  
   299  	movq	%mm3, -8(%edx,%eax,4)	C prev
   300  	por	%mm2, %mm0
   301  
   302  	movq	8(%ebx,%eax,4), %mm3	C
   303  	psrlq	%mm6, %mm1		C
   304  
   305  	movq	%mm0, (%edx,%eax,4)
   306  	movq	%mm3, %mm2		C
   307  
   308  	psllq	%mm7, %mm3		C
   309  	addl	$4, %eax
   310  
   311  	por	%mm1, %mm3		C
   312  	js	L(unroll_loop)
   313  
   314  
   315  L(finish):
   316  	C eax	0 to 3 representing respectively 3 to 0 limbs remaining
   317  
   318  	testb	$2, %al
   319  
   320  	jnz	L(finish_no_two)
   321  
   322  	movq	(%ebx,%eax,4), %mm0
   323  	psrlq	%mm6, %mm2
   324  
   325  	movq	%mm0, %mm1
   326  	psllq	%mm7, %mm0
   327  
   328  	movq	%mm3, -8(%edx,%eax,4)	C prev
   329  	por	%mm2, %mm0
   330  
   331  	movq	%mm1, %mm2
   332  	movq	%mm0, %mm3
   333  
   334  	addl	$2, %eax
   335  L(finish_no_two):
   336  
   337  
   338  	C eax	2 or 3 representing respectively 1 or 0 limbs remaining
   339  	C
   340  	C mm2	src prev qword, from -8(%ebx,%eax,4)
   341  	C mm3	dst qword, for -8(%edx,%eax,4)
   342  
   343  	testb	$1, %al
   344  	popl	%edi
   345  
   346  	movd	%mm5, %eax	C retval, the testb flags are kept for the jnz
   347  	jnz	L(finish_zero)
   348  
   349  
   350  	C One extra limb, destination was aligned.
   351  	C
   352  	C source                ebx
   353  	C +-------+---------------+--
   354  	C |       |      mm2      |
   355  	C +-------+---------------+--
   356  	C
   357  	C dest                                  edx
   358  	C +-------+---------------+---------------+--
   359  	C |       |               |      mm3      |
   360  	C +-------+---------------+---------------+--
   361  	C
   362  	C mm6 = shift
   363  	C mm7 = ecx = 64-shift
   364  
   365  
   366  	C One extra limb, destination was unaligned.
   367  	C
   368  	C source                ebx
   369  	C +-------+---------------+--
   370  	C |       |      mm2      |
   371  	C +-------+---------------+--
   372  	C
   373  	C dest                          edx
   374  	C +---------------+---------------+--
   375  	C |               |      mm3      |
   376  	C +---------------+---------------+--
   377  	C
   378  	C mm6 = shift+32
   379  	C mm7 = ecx = 64-(shift+32)
   380  
   381  
   382  	C In both cases there's one extra limb of src to fetch and combine
   383  	C with mm2 to make a qword at 8(%edx), and in the aligned case
   384  	C there's a further extra limb of dst to be formed.
   385  
   386  
   387  	movd	8(%ebx), %mm0
   388  	psrlq	%mm6, %mm2
   389  
   390  	movq	%mm0, %mm1
   391  	psllq	%mm7, %mm0
   392  
   393  	movq	%mm3, (%edx)
   394  	por	%mm2, %mm0
   395  
   396  	psrlq	%mm6, %mm1
   397  	andl	$32, %ecx		C bit 5 of ecx, set when dst was aligned (assumes shift is 1 to 31)
   398  
   399  	popl	%ebx
   400  	jz	L(finish_one_unaligned)	C flags from the andl, popl leaves them alone
   401  
   402  	C dst was aligned, must store one extra limb
   403  	movd	%mm1, 16(%edx)
   404  L(finish_one_unaligned):
   405  
   406  	movq	%mm0, 8(%edx)
   407  
   408  	emms
   409  
   410  	ret
   411  
   412  
   413  L(finish_zero):
   414  
   415  	C No extra limbs, destination was aligned.
   416  	C
   417  	C source        ebx
   418  	C +---------------+--
   419  	C |      mm2      |
   420  	C +---------------+--
   421  	C
   422  	C dest                        edx+4
   423  	C +---------------+---------------+--
   424  	C |               |      mm3      |
   425  	C +---------------+---------------+--
   426  	C
   427  	C mm6 = shift
   428  	C mm7 = ecx = 64-shift
   429  
   430  
   431  	C No extra limbs, destination was unaligned.
   432  	C
   433  	C source        ebx
   434  	C +---------------+--
   435  	C |      mm2      |
   436  	C +---------------+--
   437  	C
   438  	C dest                edx+4
   439  	C +-------+---------------+--
   440  	C |       |      mm3      |
   441  	C +-------+---------------+--
   442  	C
   443  	C mm6 = shift+32
   444  	C mm7 = 64-(shift+32)
   445  
   446  
   447  	C The movd for the unaligned case is clearly the same data as the
   448  	C movq for the aligned case, it's just a choice between whether one
   449  	C or two limbs should be written.
   450  
   451  
   452  	movq	%mm3, 4(%edx)
   453  	psrlq	%mm6, %mm2
   454  
   455  	movd	%mm2, 12(%edx)
   456  	andl	$32, %ecx		C bit 5 of ecx, set when dst was aligned (assumes shift is 1 to 31)
   457  
   458  	popl	%ebx
   459  	jz	L(finish_zero_unaligned)	C flags from the andl, popl leaves them alone
   460  
   461  	movq	%mm2, 12(%edx)
   462  L(finish_zero_unaligned):
   463  
   464  	emms
   465  
   466  	ret
   467  
   468  EPILOGUE()