github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/mmx/rshift.asm

dnl  AMD K7 mpn_rshift -- mpn right shift.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K7: 1.21 cycles/limb (at 16 limbs/loop).



dnl  K7: UNROLL_COUNT cycles/limb
dnl           4           1.51
dnl           8           1.26
dnl          16           1.21
dnl          32           1.2
dnl  Maximum possible with the current code is 64.

deflit(UNROLL_COUNT, 16)


C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size right by shift many bits and store the result in dst,size.
C Zeros are shifted in at the left.  The bits shifted out at the right are
C the return value.
C
C This code uses 64-bit MMX operations, which makes it possible to handle
C two limbs at a time, for a theoretical 1.0 cycles/limb.  Plain integer
C code, on the other hand, suffers from shrd being a vector path decode and
C running at 3 cycles back-to-back.
C
C Full speed depends on source and destination being aligned, and some hairy
C setups and finish-ups are done to arrange this for the loop.
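As a reader's aid (an editorial sketch, not GMP code): the operation specified above can be written in plain C roughly as follows. limb_t and ref_rshift are illustrative names, limbs are taken to be 32 bits as on this target, and shift is assumed to satisfy 1 <= shift <= 31, as mpn_rshift requires.

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;   /* 32-bit limb, matching this x86 code */

    limb_t ref_rshift(limb_t *dst, const limb_t *src, size_t size, unsigned shift)
    {
        /* The bits shifted out at the right become the return value. */
        limb_t retval = src[0] << (32 - shift);
        for (size_t i = 0; i + 1 < size; i++)
            dst[i] = (src[i] >> shift) | (src[i + 1] << (32 - shift));
        /* Zeros are shifted in at the left of the high limb. */
        dst[size - 1] = src[size - 1] >> shift;
        return retval;
    }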

ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 10)
',`
deflit(UNROLL_THRESHOLD, 10)
')

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

defframe(SAVE_EDI, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
deflit(SAVE_SIZE, 12)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_rshift)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %eax
	movl	PARAM_SRC, %edx
	subl	$SAVE_SIZE, %esp
deflit(`FRAME',SAVE_SIZE)

	movl	PARAM_SHIFT, %ecx
	movl	%edi, SAVE_EDI

	movl	PARAM_DST, %edi
	decl	%eax
	jnz	L(more_than_one_limb)

	movl	(%edx), %edx		C src limb

	shrdl(	%cl, %edx, %eax)	C eax was decremented to zero

	shrl	%cl, %edx

	movl	%edx, (%edi)		C dst limb
	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp

	ret
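The single-limb early-out above leans on shrdl with a zeroed destination: shifting the zero %eax right while filling from %edx leaves exactly the bits shifted out. A hedged C equivalent (illustrative names, same limb assumptions as the earlier sketch):

    #include <stdint.h>

    typedef uint32_t limb_t;

    limb_t rshift_one_limb(limb_t *dst, const limb_t *src, unsigned shift)
    {
        limb_t retval = src[0] << (32 - shift);  /* shrdl into the zeroed %eax */
        dst[0] = src[0] >> shift;                /* shrl on %edx */
        return retval;
    }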


C -----------------------------------------------------------------------------
L(more_than_one_limb):
	C eax	size-1
	C ebx
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp

	movd	PARAM_SHIFT, %mm6	C rshift
	movd	(%edx), %mm5		C src low limb
	cmp	$UNROLL_THRESHOLD-1, %eax

	jae	L(unroll)
	leal	(%edx,%eax,4), %edx	C &src[size-1]
	leal	-4(%edi,%eax,4), %edi	C &dst[size-2]

	movd	(%edx), %mm4		C src high limb
	negl	%eax


L(simple_top):
	C eax	loop counter, limbs, negative
	C ebx
	C ecx	shift
	C edx	&src[size-1]
	C esi
	C edi	&dst[size-2]
	C ebp
	C
	C mm0	scratch
	C mm4	src high limb
	C mm5	src low limb
	C mm6	shift

	movq	(%edx,%eax,4), %mm0
	incl	%eax

	psrlq	%mm6, %mm0

	movd	%mm0, (%edi,%eax,4)
	jnz	L(simple_top)


	psllq	$32, %mm5
	psrlq	%mm6, %mm4

	psrlq	%mm6, %mm5
	movd	%mm4, 4(%edi)		C dst high limb

	movd	%mm5, %eax		C return value

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret
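In C terms, each pass of the simple (non-unrolled) loop above reads two adjacent limbs as one qword, shifts the whole qword right, and stores the low 32 bits; a sketch under the same assumptions as before (the return value, carried in mm5 above, is omitted):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    void rshift_simple(limb_t *dst, const limb_t *src, size_t size, unsigned shift)
    {
        for (size_t i = 0; i + 1 < size; i++) {
            /* movq / psrlq / movd: qword load, 64-bit shift, dword store */
            uint64_t pair = (uint64_t)src[i] | ((uint64_t)src[i + 1] << 32);
            dst[i] = (limb_t)(pair >> shift);
        }
        dst[size - 1] = src[size - 1] >> shift;  /* high limb, stored from mm4 */
    }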


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll):
	C eax	size-1
	C ebx
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C mm5	src low limb
	C mm6	rshift

	testb	$4, %dl
	movl	%esi, SAVE_ESI
	movl	%ebx, SAVE_EBX

	psllq	$32, %mm5
	jz	L(start_src_aligned)


	C src isn't aligned, process low limb separately (marked xxx) and
	C step src and dst by one limb, making src aligned.
	C
	C source                  edx
	C --+-------+-------+-------+
	C           |          xxx  |
	C --+-------+-------+-------+
	C         4mod8   0mod8   4mod8
	C
	C         dest            edi
	C         --+-------+-------+
	C           |       |  xxx  |
	C         --+-------+-------+

	movq	(%edx), %mm0		C src low two limbs
	addl	$4, %edx
	movl	%eax, PARAM_SIZE	C size-1

	addl	$4, %edi
	decl	%eax			C size-2 is new size-1

	psrlq	%mm6, %mm0
	movl	%edi, PARAM_DST		C new dst

	movd	%mm0, -4(%edi)
L(start_src_aligned):
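The source fixup just performed, in C terms: when src is only 4-byte aligned, one limb is produced up front so that all remaining qword loads are 8-byte aligned. A sketch with a hypothetical helper (not from GMP):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    /* Returns the aligned src pointer; advances *pdst and shrinks *psize. */
    const limb_t *align_src(limb_t **pdst, const limb_t *src,
                            size_t *psize, unsigned shift)
    {
        if (((uintptr_t)src & 4) != 0) {           /* testb $4, %dl */
            uint64_t pair = (uint64_t)src[0] | ((uint64_t)src[1] << 32);
            *(*pdst)++ = (limb_t)(pair >> shift);  /* movq/psrlq/movd above */
            src++;                                 /* addl $4, %edx */
            --*psize;                              /* decl %eax */
        }
        return src;
    }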


	movq	(%edx), %mm1		C src low two limbs
	decl	%eax			C size-2, two last limbs handled at end
	testl	$4, %edi

	psrlq	%mm6, %mm5
	jz	L(start_dst_aligned)


	C dst isn't aligned, add 4 to make it so, and pretend the shift is
	C 32 bits extra.  Low limb of dst (marked xxx) handled here separately.
	C
	C          source          edx
	C          --+-------+-------+
	C            |      mm1      |
	C          --+-------+-------+
	C                  4mod8   0mod8
	C
	C  dest                    edi
	C  --+-------+-------+-------+
	C                    |  xxx  |
	C  --+-------+-------+-------+
	C          4mod8   0mod8   4mod8

	movq	%mm1, %mm0
	psrlq	%mm6, %mm1
	addl	$32, %ecx		C shift+32

	movd	%mm1, (%edi)
	movq	%mm0, %mm1
	addl	$4, %edi		C new dst

	movd	%ecx, %mm6
L(start_dst_aligned):
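The destination fixup takes a different tack: rather than stepping the source, it emits one limb and folds the misalignment into the shift count, since shifting a qword right by shift+32 produces the limb pair offset by one position. A sketch (hypothetical helper, not from GMP):

    #include <stdint.h>

    typedef uint32_t limb_t;

    /* Returns the effective shift; advances *pdst to an 8-byte boundary. */
    unsigned align_dst(limb_t **pdst, const limb_t *src, unsigned shift)
    {
        if (((uintptr_t)*pdst & 4) != 0) {         /* testl $4, %edi */
            uint64_t pair = (uint64_t)src[0] | ((uint64_t)src[1] << 32);
            *(*pdst)++ = (limb_t)(pair >> shift);  /* movd %mm1, (%edi) */
            shift += 32;                           /* addl $32, %ecx */
        }
        return shift;
    }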


	movq	%mm1, %mm2		C copy of src low two limbs
	negl	%ecx
	andl	$-2, %eax		C round size down to even

	movl	%eax, %ebx
	negl	%eax
	addl	$64, %ecx

	andl	$UNROLL_MASK, %eax
	decl	%ebx

	shll	%eax

	movd	%ecx, %mm7		C lshift = 64-rshift

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(entry) (%eax,%eax,4), %esi
	negl	%eax
')
	shrl	$UNROLL_LOG2, %ebx	C loop counter

	leal	ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
	movl	PARAM_SIZE, %eax	C for use at end

	jmp	*%esi
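The leal/jmp pair above is a computed jump into the unrolled loop: the remainder of the limb count modulo the unroll factor is absorbed by entering the body partway through, instead of by a separate cleanup loop. The C analogue is Duff's device; a sketch with a 4-way unroll (the real loop is 16-way, works on qwords, and threads a return value through, all omitted here):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    /* One unrolled step: produce one limb of the shifted result. */
    #define STEP() \
        (dst[i] = (limb_t)((((uint64_t)src[i + 1] << 32) | src[i]) >> shift), i++)

    void rshift_computed_entry(limb_t *dst, const limb_t *src,
                               size_t size, unsigned shift)
    {
        size_t n = size - 1;          /* limb pairs to process */
        size_t i = 0;
        if (n == 0)
            return;
        size_t iters = (n + 3) / 4;   /* passes over the 4-way unrolled body */
        switch (n % 4) {              /* computed entry, like "jmp *%esi" */
        case 0: do { STEP();
        case 3:      STEP();
        case 2:      STEP();
        case 1:      STEP();
                } while (--iters > 0);
        }
        dst[n] = src[n] >> shift;     /* high limb */
    }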


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%eax,%eax,4), %esi
	addl	$L(entry)-L(here), %esi
	addl	(%esp), %esi
	negl	%eax

	ret_internal
')


C -----------------------------------------------------------------------------
	ALIGN(64)
L(top):
	C eax	size, for use at end
	C ebx	loop counter
	C ecx	lshift
	C edx	src
	C esi	was computed jump
	C edi	dst
	C ebp
	C
	C mm0	scratch
	C mm1	\ carry (alternating)
	C mm2	/
	C mm6	rshift
	C mm7	lshift
	C
	C 10 code bytes/limb
	C
	C The two chunks differ in whether mm1 or mm2 holds the carry.
	C The computed jump puts the initial carry in both mm1 and mm2.

L(entry):
deflit(CHUNK_COUNT, 4)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 8))

Zdisp(	movq,	disp0,(%edx), %mm0)
	psrlq	%mm6, %mm2

	movq	%mm0, %mm1
	psllq	%mm7, %mm0

	por	%mm2, %mm0
Zdisp(	movq,	%mm0, disp0,(%edi))


Zdisp(	movq,	disp1,(%edx), %mm0)
	psrlq	%mm6, %mm1

	movq	%mm0, %mm2
	psllq	%mm7, %mm0

	por	%mm1, %mm0
Zdisp(	movq,	%mm0, disp1,(%edi))
')

	addl	$UNROLL_BYTES, %edx
	addl	$UNROLL_BYTES, %edi
	decl	%ebx

	jns	L(top)
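Each chunk of the loop body computes one destination qword from the carried qword and a freshly loaded one; alternating the carry between mm1 and mm2 merely saves a register copy per chunk. One chunk in C terms (a sketch; 1 <= rshift <= 63 assumed):

    #include <stdint.h>

    /* Mirrors one movq/psrlq/psllq/por/movq chunk; returns the next carry. */
    static inline uint64_t rshift_chunk(uint64_t carry, uint64_t data,
                                        unsigned rshift, uint64_t *out)
    {
        *out = (carry >> rshift) | (data << (64 - rshift));  /* psrlq+psllq+por */
        return data;   /* becomes the carry of the next chunk, with no copy */
    }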


deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
deflit(`disp1', eval(disp0-0 + 8))

	testb	$1, %al
	psrlq	%mm6, %mm2	C wanted rshifted in all cases below
	movl	SAVE_ESI, %esi

	movd	%mm5, %eax		C return value

	movl	SAVE_EBX, %ebx
	jz	L(end_even)


	C Size odd, destination was aligned.
	C
	C source
	C       edx
	C +-------+---------------+--
	C |       |      mm2      |
	C +-------+---------------+--
	C
	C dest                  edi
	C +-------+---------------+---------------+--
	C |       |               |    written    |
	C +-------+---------------+---------------+--
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size odd, destination was unaligned.
	C
	C source
	C       edx
	C +-------+---------------+--
	C |       |      mm2      |
	C +-------+---------------+--
	C
	C dest          edi
	C +---------------+---------------+--
	C |               |    written    |
	C +---------------+---------------+--
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C In both cases there's one extra limb of src to fetch and combine
	C with mm2 to make a qword to store, and in the aligned case there's
	C a further extra limb of dst to be formed.


	movd	disp0(%edx), %mm0
	movq	%mm0, %mm1

	psllq	%mm7, %mm0
	testb	$32, %cl

	por	%mm2, %mm0
	psrlq	%mm6, %mm1

	movq	%mm0, disp0(%edi)
	jz	L(finish_odd_unaligned)

	movd	%mm1, disp1(%edi)
L(finish_odd_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret
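Roughly, in C (a loose editorial rendering; the exact store offsets relative to disp0/disp1 are elided): the last source limb is merged with the already-shifted carry into one qword store, and when the destination was aligned (no +32 folded into the shift) one further high limb is written.

    #include <stdint.h>

    typedef uint32_t limb_t;

    /* carry = qword from the loop, already shifted right; last = final src
       limb; eff_shift = shift, or shift+32 if the destination was unaligned. */
    void finish_odd(limb_t *dst, uint64_t carry, limb_t last, unsigned eff_shift)
    {
        uint64_t q = carry | ((uint64_t)last << (64 - eff_shift));  /* por */
        dst[0] = (limb_t)q;              /* movq %mm0, disp0(%edi)... */
        dst[1] = (limb_t)(q >> 32);      /* ...both halves of the qword store */
        if (eff_shift < 32)              /* aligned case, testb $32, %cl */
            dst[2] = last >> eff_shift;  /* movd %mm1, disp1(%edi) */
    }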


L(end_even):

	C Size even, destination was aligned.
	C
	C source
	C +---------------+--
	C |      mm2      |
	C +---------------+--
	C
	C dest          edi
	C +---------------+---------------+--
	C |               |    written    |
	C +---------------+---------------+--
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size even, destination was unaligned.
	C
	C source
	C +---------------+--
	C |      mm2      |
	C +---------------+--
	C
	C dest  edi
	C +-------+---------------+--
	C |       |    written    |
	C +-------+---------------+--
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C The movd for the unaligned case stores the same data as the movq
	C for the aligned case; it's just a choice between whether one or
	C two limbs should be written.


	testb	$32, %cl
	movd	%mm2, disp0(%edi)

	jz	L(end_even_unaligned)

	movq	%mm2, disp0(%edi)
L(end_even_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret

EPILOGUE()
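For completeness, this routine is reached through GMP's public mpn layer. A minimal caller (assumes an installed GMP, built and linked with -lgmp; the limb values below fit either 32- or 64-bit limbs):

    #include <stdio.h>
    #include <gmp.h>

    int main(void)
    {
        mp_limb_t src[3] = { 0x89abcdef, 0x01234567, 0x00000007 };
        mp_limb_t dst[3];

        /* Shift the 3-limb value right by 4 bits; the bits shifted out
           return in the high end of the result. */
        mp_limb_t out = mpn_rshift(dst, src, 3, 4);

        printf("returned %lx, dst = {%lx, %lx, %lx}\n",
               (unsigned long)out, (unsigned long)dst[0],
               (unsigned long)dst[1], (unsigned long)dst[2]);
        return 0;
    }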