github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/mmx/lshift.asm (about)

     1  dnl  AMD K7 mpn_lshift -- mpn left shift.
     2  
     3  dnl  Copyright 1999-2002 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C K7: 1.21 cycles/limb (at 16 limbs/loop).
    35  
    36  
    37  
    38  dnl  K7: UNROLL_COUNT cycles/limb
    39  dnl           4           1.51
    40  dnl           8           1.26
    41  dnl          16           1.21
    42  dnl          32           1.2
    43  dnl  Maximum possible with the current code is 64.
    44  dnl  UNROLL_COUNT is the number of limbs handled per pass of the loop at L(top).
    45  deflit(UNROLL_COUNT, 16)
    46  
    47  
    48  C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
    49  C                       unsigned shift);
    50  C
    51  C Shift src,size left by shift many bits and store the result in dst,size.
    52  C Zeros are shifted in at the right.  The bits shifted out at the left are
    53  C the return value.
    54  C
    55  C The comments in mpn_rshift apply here too.
    56  
    57  ifdef(`PIC',`
    58  deflit(UNROLL_THRESHOLD, 10)	C sizes of 10 limbs or more take L(unroll)
    59  ',`
    60  deflit(UNROLL_THRESHOLD, 10)	C same value as the PIC case
    61  ')
    62  
    63  defframe(PARAM_SHIFT,16)	C shift count
    64  defframe(PARAM_SIZE, 12)	C size in limbs
    65  defframe(PARAM_SRC,  8)	C src pointer
    66  defframe(PARAM_DST,  4)	C dst pointer
    67  
    68  defframe(SAVE_EDI, -4)	C spill slots for the registers this routine saves
    69  defframe(SAVE_ESI, -8)
    70  defframe(SAVE_EBX, -12)
    71  deflit(SAVE_SIZE, 12)	C bytes taken off esp in the prologue for the three slots
    72  
    73  	TEXT
    74  	ALIGN(32)
    75  
    76  PROLOGUE(mpn_lshift)
    77  deflit(`FRAME',0)
    78  
    79  	movl	PARAM_SIZE, %eax	C eax = size
    80  	movl	PARAM_SRC, %edx
    81  	subl	$SAVE_SIZE, %esp	C room for the SAVE_ slots
    82  deflit(`FRAME',SAVE_SIZE)
    83  
    84  	movl	PARAM_SHIFT, %ecx
    85  	movl	%edi, SAVE_EDI
    86  
    87  	movl	PARAM_DST, %edi
    88  	decl	%eax			C eax = size-1, zero flag set when size is 1
    89  	jnz	L(more_than_one_limb)
    90  	C Single limb: integer registers only, no MMX used, hence no emms on this path.
    91  	movl	(%edx), %edx		C the one src limb
    92  
    93  	shldl(	%cl, %edx, %eax)	C eax was decremented to zero, so it ends up as just the bits shifted out (return value)
    94  
    95  	shll	%cl, %edx		C the shifted limb
    96  
    97  	movl	%edx, (%edi)		C dst[0]
    98  	movl	SAVE_EDI, %edi
    99  	addl	$SAVE_SIZE, %esp
   100  
   101  	ret
   102  
   103  
   104  C -----------------------------------------------------------------------------
   105  L(more_than_one_limb):
   106  	C eax	size-1
   107  	C ebx
   108  	C ecx	shift
   109  	C edx	src
   110  	C esi
   111  	C edi	dst
   112  	C ebp
   113  
   114  	movd	PARAM_SHIFT, %mm6	C mm6 = shift
   115  	movd	(%edx,%eax,4), %mm5	C src high limb
   116  	cmp	$UNROLL_THRESHOLD-1, %eax	C size-1 against threshold-1
   117  
   118  	jae	L(unroll)		C taken when size >= UNROLL_THRESHOLD
   119  	negl	%ecx
   120  	movd	(%edx), %mm4		C src low limb, read before any dst store
   121  
   122  	addl	$32, %ecx		C ecx = 32-shift
   123  
   124  	movd	%ecx, %mm7
   125  
   126  L(simple_top):
   127  	C eax	loop counter, limbs
   128  	C ebx
   129  	C ecx
   130  	C edx	src
   131  	C esi
   132  	C edi	dst
   133  	C ebp
   134  	C
   135  	C mm0	scratch
   136  	C mm4	src low limb
   137  	C mm5	src high limb
   138  	C mm6	shift
   139  	C mm7	32-shift
   140  
   141  	movq	-4(%edx,%eax,4), %mm0	C src[eax] in the high dword, src[eax-1] in the low
   142  	decl	%eax			C the jnz below tests this result
   143  
   144  	psrlq	%mm7, %mm0		C low dword is now the finished dst limb
   145  
   146  	movd	%mm0, 4(%edi,%eax,4)	C stored at the index eax had before the decl
   147  	jnz	L(simple_top)
   148  
   149  	C Loop done: only dst[0] and the return value remain.
   150  	psllq	%mm6, %mm5
   151  	psllq	%mm6, %mm4
   152  
   153  	psrlq	$32, %mm5		C bits shifted out of the src high limb
   154  	movd	%mm4, (%edi)		C dst low limb
   155  
   156  	movd	%mm5, %eax		C return value
   157  
   158  	movl	SAVE_EDI, %edi
   159  	addl	$SAVE_SIZE, %esp
   160  	emms				C MMX registers were used on this path
   161  
   162  	ret
   163  
   164  
   165  C -----------------------------------------------------------------------------
   166  	ALIGN(16)
   167  L(unroll):
   168  	C eax	size-1
   169  	C ebx	(saved just below)
   170  	C ecx	shift
   171  	C edx	src
   172  	C esi	(saved just below)
   173  	C edi	dst
   174  	C ebp
   175  	C
   176  	C mm5	src high limb, for return value
   177  	C mm6	lshift
   178  
   179  	movl	%esi, SAVE_ESI
   180  	movl	%ebx, SAVE_EBX
   181  	leal	-4(%edx,%eax,4), %edx   C &src[size-2]
   182  
   183  	testb	$4, %dl			C bit 2 of that address: 0mod8 or 4mod8
   184  	movq	(%edx), %mm1		C src high qword
   185  
   186  	jz	L(start_src_aligned)
   187  
   188  
   189  	C src isn't aligned, process high limb (marked xxx) separately to
   190  	C make it so
   191  	C
   192  	C  source    -4(edx,%eax,4)
   193  	C                  |
   194  	C  +-------+-------+-------+--
   195  	C  |  xxx          |
   196  	C  +-------+-------+-------+--
   197  	C        0mod8   4mod8   0mod8
   198  	C
   199  	C  dest      -4(edi,%eax,4)
   200  	C                  |
   201  	C  +-------+-------+--
   202  	C  |  xxx  |       |
   203  	C  +-------+-------+--
   204  
   205  	psllq	%mm6, %mm1
   206  	subl	$4, %edx
   207  	movl	%eax, PARAM_SIZE	C size-1, replaces the size argument in its stack slot
   208  
   209  	psrlq	$32, %mm1
   210  	decl	%eax			C size-2 is new size-1
   211  
   212  	movd	%mm1, 4(%edi,%eax,4)	C dst high limb (xxx)
   213  	movq	(%edx), %mm1		C new src high qword
   214  L(start_src_aligned):
   215  
   216  
   217  	leal	-4(%edi,%eax,4), %edi   C &dst[size-2]
   218  	psllq	%mm6, %mm5
   219  
   220  	testl	$4, %edi
   221  	psrlq	$32, %mm5		C return value
   222  
   223  	jz	L(start_dst_aligned)
   224  
   225  
   226  	C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
   227  	C shift is 32 bits extra.  High limb of dst (marked xxx) handled
   228  	C here separately.
   229  	C
   230  	C  source       %edx
   231  	C  +-------+-------+--
   232  	C  |      mm1      |
   233  	C  +-------+-------+--
   234  	C                0mod8   4mod8
   235  	C
   236  	C  dest         %edi
   237  	C  +-------+-------+-------+--
   238  	C  |  xxx  |
   239  	C  +-------+-------+-------+--
   240  	C        0mod8   4mod8   0mod8
   241  
   242  	movq	%mm1, %mm0		C keep an unshifted copy
   243  	psllq	%mm6, %mm1
   244  	addl	$32, %ecx		C shift+32
   245  
   246  	psrlq	$32, %mm1
   247  
   248  	movd	%mm1, 4(%edi)		C dst high limb (xxx)
   249  	movq	%mm0, %mm1		C back to the unshifted src high qword
   250  	subl	$4, %edi
   251  
   252  	movd	%ecx, %mm6		C new lshift
   253  L(start_dst_aligned):
   254  	C Below: loop count, shift-right amount and the entry point into the unrolled code.
   255  	decl	%eax			C size-2, two last limbs handled at end
   256  	movq	%mm1, %mm2		C copy of src high qword
   257  	negl	%ecx
   258  
   259  	andl	$-2, %eax		C round size down to even
   260  	addl	$64, %ecx		C ecx = 64-lshift
   261  
   262  	movl	%eax, %ebx
   263  	negl	%eax
   264  
   265  	andl	$UNROLL_MASK, %eax	C limbs to skip in the first pass (assuming UNROLL_MASK is UNROLL_COUNT-1)
   266  	decl	%ebx
   267  
   268  	shll	%eax			C doubled here, times 5 in the leal gives 10 code bytes per skipped limb
   269  
   270  	movd	%ecx, %mm7		C rshift = 64-lshift
   271  
   272  ifdef(`PIC',`
   273  	call	L(pic_calc)
   274  L(here):
   275  ',`
   276  	leal	L(entry) (%eax,%eax,4), %esi
   277  ')
   278  	shrl	$UNROLL_LOG2, %ebx	C loop counter
   279  
   280  	leal	ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx	C pointers moved up 4 bytes per skipped limb
   281  	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
   282  	movl	PARAM_SIZE, %eax	C for use at end
   283  	jmp	*%esi			C enter the unrolled code part way through
   284  
   285  
   286  ifdef(`PIC',`
   287  L(pic_calc):
   288  	C See mpn/x86/README about old gas bugs
   289  	leal	(%eax,%eax,4), %esi	C 5*eax, the same scaling as the non-PIC leal
   290  	addl	$L(entry)-L(here), %esi	C plus the distance from L(here) to L(entry)
   291  	addl	(%esp), %esi		C plus the return address, which is L(here)
   292  
   293  	ret_internal
   294  ')
   295  
   296  
   297  C -----------------------------------------------------------------------------
   298  	ALIGN(32)
   299  L(top):
   300  	C eax	size (for use at end; size-1 if the src high limb was handled separately)
   301  	C ebx	loop counter
   302  	C ecx	rshift
   303  	C edx	src
   304  	C esi	computed jump
   305  	C edi	dst
   306  	C ebp
   307  	C
   308  	C mm0	scratch
   309  	C mm1	\ carry (alternating, mm2 first)
   310  	C mm2	/
   311  	C mm6	lshift
   312  	C mm7	rshift
   313  	C
   314  	C 10 code bytes/limb
   315  	C
   316  	C The two chunks differ in whether mm1 or mm2 holds the carry.
   317  	C The setup before the computed jump leaves the initial carry in both mm1 and mm2.
   318  
   319  L(entry):
   320  deflit(CHUNK_COUNT, 4)
   321  forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
   322  	deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
   323  	deflit(`disp1', eval(disp0 - 8))
   324  
   325  Zdisp(	movq,	disp0,(%edx), %mm0)
   326  	psllq	%mm6, %mm2		C carry from the qword above
   327  
   328  	movq	%mm0, %mm1		C unshifted copy is the next carry
   329  	psrlq	%mm7, %mm0
   330  
   331  	por	%mm2, %mm0		C combined dst qword
   332  Zdisp(	movq,	%mm0, disp0,(%edi))
   333  
   334  
   335  Zdisp(	movq,	disp1,(%edx), %mm0)
   336  	psllq	%mm6, %mm1		C same again with mm1 and mm2 swapped
   337  
   338  	movq	%mm0, %mm2
   339  	psrlq	%mm7, %mm0
   340  
   341  	por	%mm1, %mm0
   342  Zdisp(	movq,	%mm0, disp1,(%edi))
   343  ')
   344  
   345  	subl	$UNROLL_BYTES, %edx	C step both pointers down by one full pass
   346  	subl	$UNROLL_BYTES, %edi
   347  	decl	%ebx
   348  
   349  	jns	L(top)			C repeat until the counter goes negative
   350  
   351  
   352  
   353  define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')
   354  C disp(n) is n, less 128 when UNROLL_BYTES is 256; going by the helper name, a zero result is emitted as nothing.
   355  L(end):
   356  	testb	$1, %al			C parity of the size reloaded from PARAM_SIZE
   357  	movl	SAVE_EBX, %ebx
   358  	psllq	%mm6, %mm2	C wanted left shifted in all cases below
   359  
   360  	movd	%mm5, %eax		C return value
   361  
   362  	movl	SAVE_ESI, %esi
   363  	jz	L(end_even)
   364  
   365  
   366  L(end_odd):
   367  
   368  	C Size odd, destination was aligned.
   369  	C
   370  	C                 source        edx+8   edx+4
   371  	C                 --+---------------+-------+
   372  	C                   |      mm2      |       |
   373  	C                 --+---------------+-------+
   374  	C
   375  	C dest                            edi
   376  	C --+---------------+---------------+-------+
   377  	C   |   written     |               |       |
   378  	C --+---------------+---------------+-------+
   379  	C
   380  	C mm6 = shift
   381  	C mm7 = ecx = 64-shift
   382  
   383  
   384  	C Size odd, destination was unaligned.
   385  	C
   386  	C                 source        edx+8   edx+4
   387  	C                 --+---------------+-------+
   388  	C                   |      mm2      |       |
   389  	C                 --+---------------+-------+
   390  	C
   391  	C         dest                            edi
   392  	C         --+---------------+---------------+
   393  	C           |   written     |               |
   394  	C         --+---------------+---------------+
   395  	C
   396  	C mm6 = shift+32
   397  	C mm7 = ecx = 64-(shift+32)
   398  
   399  
   400  	C In both cases there's one extra limb of src to fetch and combine
   401  	C with mm2 to make a qword at (%edi), and in the aligned case
   402  	C there's an extra limb of dst to be formed from that extra src limb
   403  	C left shifted.
   404  
   405  	movd	disp(4) (%edx), %mm0	C the extra src limb
   406  	testb	$32, %cl		C bit 5 of ecx: set if dst was aligned (given 1 <= shift <= 31)
   407  
   408  	movq	%mm0, %mm1
   409  	psllq	$32, %mm0
   410  
   411  	psrlq	%mm7, %mm0
   412  	psllq	%mm6, %mm1		C extra limb left shifted, wanted in the aligned case
   413  
   414  	por	%mm2, %mm0
   415  
   416  	movq	%mm0, disp(0) (%edi)
   417  	jz	L(end_odd_unaligned)	C acts on the testb above
   418  	movd	%mm1, disp(-4) (%edi)	C extra dst limb, aligned case only
   419  L(end_odd_unaligned):
   420  
   421  	movl	SAVE_EDI, %edi
   422  	addl	$SAVE_SIZE, %esp
   423  	emms
   424  
   425  	ret
   426  
   427  
   428  L(end_even):
   429  
   430  	C Size even, destination was aligned.
   431  	C
   432  	C                 source        edx+8
   433  	C                 --+---------------+
   434  	C                   |      mm2      |
   435  	C                 --+---------------+
   436  	C
   437  	C dest                            edi
   438  	C --+---------------+---------------+
   439  	C   |   written     |               |
   440  	C --+---------------+---------------+
   441  	C
   442  	C mm6 = shift
   443  	C mm7 = ecx = 64-shift
   444  
   445  
   446  	C Size even, destination was unaligned.
   447  	C
   448  	C               source          edx+8
   449  	C                 --+---------------+
   450  	C                   |      mm2      |
   451  	C                 --+---------------+
   452  	C
   453  	C         dest                  edi+4
   454  	C         --+---------------+-------+
   455  	C           |    written    |       |
   456  	C         --+---------------+-------+
   457  	C
   458  	C mm6 = shift+32
   459  	C mm7 = ecx = 64-(shift+32)
   460  
   461  
   462  	C The movq for the aligned case overwrites the movd for the
   463  	C unaligned case.
   464  
   465  	movq	%mm2, %mm0		C mm2 was already left shifted at L(end)
   466  	psrlq	$32, %mm2		C its high dword, for the movd below
   467  
   468  	testb	$32, %cl		C bit 5 of ecx: set if dst was aligned (given 1 <= shift <= 31)
   469  	movd	%mm2, disp(4) (%edi)
   470  
   471  	jz	L(end_even_unaligned)
   472  	movq	%mm0, disp(0) (%edi)	C aligned case: full qword, covering the movd above
   473  L(end_even_unaligned):
   474  
   475  	movl	SAVE_EDI, %edi
   476  	addl	$SAVE_SIZE, %esp
   477  	emms
   478  
   479  	ret
   480  
   481  EPILOGUE()