github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/mmx/lshift.asm (about)

     1  dnl  Intel P5 mpn_lshift -- mpn left shift.
     2  
     3  dnl  Copyright 2000-2002 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C P5: 1.75 cycles/limb.
    35  
    36  
     37  C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
     38  C                       unsigned shift);
     39  C
     40  C Shift src,size left by shift many bits and store the result in dst,size.
     41  C Zeros are shifted in at the right.  Return the bits shifted out at the
     42  C left.
     43  C
     44  C The comments in mpn_rshift apply here too.
        C
        C Implementation notes:
        C
        C The four arguments are read from the stack through the PARAM_ names
        C defined below and the return value is left in eax.  ebx and edi are
        C pushed on entry and popped again on every exit path.
        C
        C The size argument selects one of three paths:
        C
        C   size == 1                     integer shld/shl only, no MMX
        C   size below UNROLL_THRESHOLD   L(simple), one limb per iteration
        C   size >= UNROLL_THRESHOLD      L(unroll), four limbs per iteration
        C
        C All paths work from the high limb of src down to the low limb.  The
        C MMX paths execute emms before returning.
        C
        C There is no check for size == 0.  L(finish) tells its aligned and
        C unaligned cases apart by bit 5 of the right shift count, which only
        C works for 1 <= shift <= 31; presumably callers guarantee both, as the
        C mpn_lshift documentation requires -- confirm.
     45  
     46  defframe(PARAM_SHIFT,16)
     47  defframe(PARAM_SIZE, 12)
     48  defframe(PARAM_SRC,  8)
     49  defframe(PARAM_DST,  4)
     50  deflit(`FRAME',0)
     51  
     52  dnl  minimum 5, because the unrolled loop can't handle less
     53  deflit(UNROLL_THRESHOLD, 5)
     54  
     55  	TEXT
     56  	ALIGN(8)
     57  
     58  PROLOGUE(mpn_lshift)
     59  
     60  	pushl	%ebx			C saved, restored before each ret
     61  	pushl	%edi			C saved, restored before each ret
        	C Two pushes so far.  NOTE(review): FRAME is presumably what keeps the
        	C PARAM_ offsets pointing at the arguments after the pushes -- confirm
        	C in the included m4 definitions.
     62  deflit(`FRAME',8)
     63  
     64  	movl	PARAM_SIZE, %eax
     65  	movl	PARAM_DST, %edx
     66  
     67  	movl	PARAM_SRC, %ebx
     68  	movl	PARAM_SHIFT, %ecx
     69  
     70  	cmp	$UNROLL_THRESHOLD, %eax
     71  	jae	L(unroll)		C unsigned: size >= UNROLL_THRESHOLD
     72  
     73  	movl	-4(%ebx,%eax,4), %edi	C src high limb
     74  	decl	%eax
     75  
     76  	jnz	L(simple)		C size-1 != 0, more than one limb
     77  
        	C size == 1: shld moves the top shift bits of the limb into the zeroed
        	C eax, which is the return value; shl then forms the single dst limb.
     78  	shldl(	%cl, %edi, %eax)	C eax was decremented to zero
     79  
     80  	shll	%cl, %edi
     81  
     82  	movl	%edi, (%edx)		C dst low limb
     83  	popl	%edi			C risk of data cache bank clash
     84  
     85  	popl	%ebx
     86  
     87  	ret
    88  
    89  
    90  C -----------------------------------------------------------------------------
     91  L(simple):
        	C Small sizes, reached with size-1 nonzero and size below
        	C UNROLL_THRESHOLD.  Each iteration loads two adjacent src limbs as one
        	C qword, shifts it right by 32-shift so that the low dword is a
        	C finished dst limb, and stores that dword.  Limbs are produced from
        	C the high end downwards; the low dst limb is formed after the loop.
        	C
     92  	C eax	size-1
     93  	C ebx	src
     94  	C ecx	shift
     95  	C edx	dst
     96  	C esi
     97  	C edi
     98  	C ebp
     99  deflit(`FRAME',8)
    100  
    101  	movd	(%ebx,%eax,4), %mm5	C src high limb
    102  
    103  	movd	%ecx, %mm6		C lshift
    104  	negl	%ecx
    105  
    106  	psllq	%mm6, %mm5		C bits shifted out land in the upper dword
    107  	addl	$32, %ecx		C ecx = 32-shift
    108  
    109  	movd	%ecx, %mm7		C rshift used in the loop
    110  	psrlq	$32, %mm5		C retval
    111  
    112  
    113  L(simple_top):
    114  	C eax	counter, index of the dst limb being made, positive, counts
        	C	down from size-1 and the loop exits when it reaches zero
    115  	C ebx	src
    116  	C ecx
    117  	C edx	dst
    118  	C esi
    119  	C edi
    120  	C
    121  	C mm0	scratch
    122  	C mm5	return value
    123  	C mm6	shift
    124  	C mm7	32-shift
    125  
    126  	movq	-4(%ebx,%eax,4), %mm0	C src limbs eax-1 (low) and eax (high)
    127  	decl	%eax
    128  
    129  	psrlq	%mm7, %mm0		C low dword is now the dst limb
    130  
    131  	C
    132  
    133  	movd	%mm0, 4(%edx,%eax,4)	C eax already decremented, hence the +4
    134  	jnz	L(simple_top)		C flags from decl, MMX ops leave them alone
    135  
    136  
    137  	movd	(%ebx), %mm0		C src low limb
    138  
    139  	movd	%mm5, %eax		C return value
    140  	psllq	%mm6, %mm0		C zeros shifted in at the right
    141  
    142  	popl	%edi
    143  	popl	%ebx
    144  
    145  	movd	%mm0, (%edx)		C dst low limb
    146  
    147  	emms
    148  
    149  	ret
   150  
   151  
   152  C -----------------------------------------------------------------------------
    153  	ALIGN(8)
    154  L(unroll):
        	C Sizes of UNROLL_THRESHOLD and up.  At most one high limb is handled
        	C separately so that the qword loads from src are 8-byte aligned.  If
        	C the qword stores to dst would then be misaligned, one more dst limb
        	C is stored separately, edx is moved down by 4 and the shift count is
        	C increased by 32 to compensate.  The loop then produces two dst
        	C qwords per pass, and L(finish) deals with the 0 to 3 limbs left over.
        	C
        	C Several conditional jumps below are separated from the integer
        	C instruction that sets their flags by MMX instructions, movd, or popl;
        	C none of those modify the flags.
        	C
    155  	C eax	size
    156  	C ebx	src
    157  	C ecx	shift
    158  	C edx	dst
    159  	C esi
    160  	C edi
    161  	C ebp
    162  deflit(`FRAME',8)
    163  
    164  	movd	-4(%ebx,%eax,4), %mm5	C src high limb
    165  	leal	(%ebx,%eax,4), %edi	C address just past the src high limb
    166  
    167  	movd	%ecx, %mm6		C lshift
    168  	andl	$4, %edi		C ZF set if that address is 0 mod 8
    169  
    170  	psllq	%mm6, %mm5		C bits shifted out land in the upper dword
    171  	jz	L(start_src_aligned)
    172  
    173  
    174  	C src isn't aligned, process high limb separately (marked xxx) to
    175  	C make it so.
    176  	C
    177  	C  source     -8(ebx,%eax,4)
    178  	C                  |
    179  	C  +-------+-------+-------+--
    180  	C  |               |
    181  	C  +-------+-------+-------+--
    182  	C        0mod8   4mod8   0mod8
    183  	C
    184  	C  dest
    185  	C     -4(edx,%eax,4)
    186  	C          |
    187  	C  +-------+-------+--
    188  	C  |  xxx  |       |
    189  	C  +-------+-------+--
    190  
    191  	movq	-8(%ebx,%eax,4), %mm0	C unaligned load
    192  
    193  	psllq	%mm6, %mm0
    194  	decl	%eax			C one limb fewer from here on
    195  
    196  	psrlq	$32, %mm0		C low dword is the dst high limb
    197  
    198  	C
    199  
    200  	movd	%mm0, (%edx,%eax,4)	C eax already decremented, this is xxx
    201  L(start_src_aligned):
    202  
    203  	movq	-8(%ebx,%eax,4), %mm1	C src high qword
    204  	leal	(%edx,%eax,4), %edi	C address just past the dst high limb
    205  
    206  	andl	$4, %edi		C ZF set if 0 mod 8, used by the jz below
    207  	psrlq	$32, %mm5		C return value
    208  
    209  	movq	-16(%ebx,%eax,4), %mm3	C src second highest qword
    210  	jz	L(start_dst_aligned)
    211  
    212  	C dst isn't aligned, subtract 4 to make it so, and pretend the shift
    213  	C is 32 bits extra.  High limb of dst (marked xxx) handled here
    214  	C separately.
    215  	C
    216  	C  source     -8(ebx,%eax,4)
    217  	C                  |
    218  	C  +-------+-------+--
    219  	C  |      mm1      |
    220  	C  +-------+-------+--
    221  	C                0mod8   4mod8
    222  	C
    223  	C  dest
    224  	C     -4(edx,%eax,4)
    225  	C          |
    226  	C  +-------+-------+-------+--
    227  	C  |  xxx  |               |
    228  	C  +-------+-------+-------+--
    229  	C        0mod8   4mod8   0mod8
    230  
    231  	movq	%mm1, %mm0
    232  	addl	$32, %ecx		C new shift
    233  
    234  	psllq	%mm6, %mm0		C mm6 still holds the original shift here
    235  
    236  	movd	%ecx, %mm6		C mm6 = shift+32 from now on
    237  	psrlq	$32, %mm0		C low dword is the dst high limb
    238  
    239  	C wasted cycle here waiting for %mm0
    240  
    241  	movd	%mm0, -4(%edx,%eax,4)	C xxx in the diagram
    242  	subl	$4, %edx		C qword stores below now fall on 0 mod 8
    243  L(start_dst_aligned):
    244  
    245  
    246  	psllq	%mm6, %mm1
    247  	negl	%ecx			C -shift
    248  
    249  	addl	$64, %ecx		C 64-shift
    250  	movq	%mm3, %mm2		C unshifted copy, left shifted later
    251  
    252  	movd	%ecx, %mm7		C rshift
    253  	subl	$8, %eax		C size-8
    254  
    255  	psrlq	%mm7, %mm3
    256  
    257  	por	%mm1, %mm3		C mm3 ready to store
    258  	jc	L(finish)		C borrow from the subl, fewer than 8 limbs
    259  
    260  
    261  	C The comments in mpn_rshift apply here too.
        	C
        	C Each pass loads two src qwords and stores two dst qwords.  A dst
        	C qword is the previous (higher) src qword shifted left, kept in mm2 or
        	C mm1, or-ed with the next (lower) src qword shifted right.  The store
        	C of a finished qword is deferred, so mm3 is always one store behind.
    262  
    263  	ALIGN(8)
    264  L(unroll_loop):
    265  	C eax	counter, limbs
    266  	C ebx	src
    267  	C ecx
    268  	C edx	dst
    269  	C esi
    270  	C edi
    271  	C
    272  	C mm0
    273  	C mm1
    274  	C mm2	src qword from 16(%ebx,%eax,4)
    275  	C mm3	dst qword ready to store to 24(%edx,%eax,4)
    276  	C
    277  	C mm5	return value
    278  	C mm6	lshift
    279  	C mm7	rshift
    280  
    281  	movq	8(%ebx,%eax,4), %mm0
    282  	psllq	%mm6, %mm2
    283  
    284  	movq	%mm0, %mm1
    285  	psrlq	%mm7, %mm0
    286  
    287  	movq	%mm3, 24(%edx,%eax,4)	C prev
    288  	por	%mm2, %mm0
    289  
    290  	movq	(%ebx,%eax,4), %mm3	C
    291  	psllq	%mm6, %mm1		C
    292  
    293  	movq	%mm0, 16(%edx,%eax,4)
    294  	movq	%mm3, %mm2		C
    295  
    296  	psrlq	%mm7, %mm3		C
    297  	subl	$4, %eax		C four limbs done, borrow ends the loop
    298  
    299  	por	%mm1, %mm3		C
    300  	jnc	L(unroll_loop)		C flags from the subl
    301  
    302  
    303  
    304  L(finish):
    305  	C eax	-4 to -1 representing respectively 0 to 3 limbs remaining
    306  
    307  	testb	$2, %al			C bit 1 is set for -2 and -1 only
    308  
    309  	jz	L(finish_no_two)
    310  
        	C Two or three limbs remain: do one more qword step, as in the first
        	C half of the loop, leaving a new pending dst qword in mm3.
    311  	movq	8(%ebx,%eax,4), %mm0
    312  	psllq	%mm6, %mm2
    313  
    314  	movq	%mm0, %mm1
    315  	psrlq	%mm7, %mm0
    316  
    317  	movq	%mm3, 24(%edx,%eax,4)	C prev
    318  	por	%mm2, %mm0
    319  
    320  	movq	%mm1, %mm2		C unshifted copy is the new prev src qword
    321  	movq	%mm0, %mm3		C new pending dst qword
    322  
    323  	subl	$2, %eax
    324  L(finish_no_two):
    325  
    326  
    327  	C eax	-4 or -3 representing respectively 0 or 1 limbs remaining
    328  	C
    329  	C mm2	src prev qword, from 16(%ebx,%eax,4)
    330  	C mm3	dst qword, for 24(%edx,%eax,4)
    331  
    332  	testb	$1, %al			C bit 0 is set for -3 only
    333  	movd	%mm5, %eax	C retval
    334  
    335  	popl	%edi
    336  	jz	L(finish_zero)		C flags still from the testb
    337  
    338  
    339  	C One extra src limb, destination was aligned.
    340  	C
    341  	C                 source                  ebx
    342  	C                 --+---------------+-------+
    343  	C                   |      mm2      |       |
    344  	C                 --+---------------+-------+
    345  	C
    346  	C dest         edx+12           edx+4     edx
    347  	C --+---------------+---------------+-------+
    348  	C   |      mm3      |               |       |
    349  	C --+---------------+---------------+-------+
    350  	C
    351  	C mm6 = shift
    352  	C mm7 = ecx = 64-shift
    353  
    354  
    355  	C One extra src limb, destination was unaligned.
    356  	C
    357  	C                 source                  ebx
    358  	C                 --+---------------+-------+
    359  	C                   |      mm2      |       |
    360  	C                 --+---------------+-------+
    361  	C
    362  	C         dest         edx+12           edx+4
    363  	C         --+---------------+---------------+
    364  	C           |      mm3      |               |
    365  	C         --+---------------+---------------+
    366  	C
    367  	C mm6 = shift+32
    368  	C mm7 = ecx = 64-(shift+32)
    369  
    370  
    371  	C In both cases there's one extra limb of src to fetch and combine
    372  	C with mm2 to make a qword at 4(%edx), and in the aligned case
    373  	C there's an extra limb of dst to be formed from that extra src limb
    374  	C left shifted.
    375  
    376  
    377  	movd	(%ebx), %mm0		C src low limb
    378  	psllq	%mm6, %mm2
    379  
    380  	movq	%mm3, 12(%edx)		C pending dst qword
    381  	psllq	$32, %mm0		C low limb in the upper dword, zeros below
    382  
    383  	movq	%mm0, %mm1
    384  	psrlq	%mm7, %mm0
    385  
    386  	por	%mm2, %mm0		C dst qword for 4(%edx)
    387  	psllq	%mm6, %mm1
    388  
    389  	movq	%mm0, 4(%edx)
    390  	psrlq	$32, %mm1		C low dword is src low limb left shifted
    391  
    392  	andl	$32, %ecx		C bit 5 set for 64-shift, clear for 32-shift
    393  	popl	%ebx
    394  
    395  	jz	L(finish_one_unaligned)	C flags from the andl
    396  
    397  	movd	%mm1, (%edx)		C dst low limb, aligned case only
    398  L(finish_one_unaligned):
    399  
    400  	emms
    401  
    402  	ret
    403  
    404  
    405  L(finish_zero):
    406  
    407  	C No extra src limbs, destination was aligned.
    408  	C
    409  	C                 source          ebx
    410  	C                 --+---------------+
    411  	C                   |      mm2      |
    412  	C                 --+---------------+
    413  	C
    414  	C dest          edx+8             edx
    415  	C --+---------------+---------------+
    416  	C   |      mm3      |               |
    417  	C --+---------------+---------------+
    418  	C
    419  	C mm6 = shift
    420  	C mm7 = ecx = 64-shift
    421  
    422  
    423  	C No extra src limbs, destination was unaligned.
    424  	C
    425  	C               source            ebx
    426  	C                 --+---------------+
    427  	C                   |      mm2      |
    428  	C                 --+---------------+
    429  	C
    430  	C         dest          edx+8   edx+4
    431  	C         --+---------------+-------+
    432  	C           |      mm3      |       |
    433  	C         --+---------------+-------+
    434  	C
    435  	C mm6 = shift+32
    436  	C mm7 = ecx = 64-(shift+32)
    437  
    438  
    439  	C The movd for the unaligned case writes the same data to 4(%edx)
    440  	C that the movq does for the aligned case.
    441  
    442  
    443  	movq	%mm3, 8(%edx)		C pending dst qword
    444  	andl	$32, %ecx		C bit 5 set for 64-shift, clear for 32-shift
    445  
    446  	psllq	%mm6, %mm2
    447  	jz	L(finish_zero_unaligned)	C flags from the andl
    448  
    449  	movq	%mm2, (%edx)		C low dst qword, aligned case only
    450  L(finish_zero_unaligned):
    451  
    452  	psrlq	$32, %mm2
    453  	popl	%ebx
    454  
    455  	movd	%mm5, %eax	C retval, eax already holds it from L(finish_no_two)
    456  
    457  	movd	%mm2, 4(%edx)
    458  
    459  	emms
    460  
    461  	ret
    462  
    463  EPILOGUE()