github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/lshift.asm (about)

     1  dnl  Intel Pentium mpn_lshift -- mpn left shift.
     2  
     3  dnl  Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C         cycles/limb
    35  C P5,P54:    6.0
    36  C P55:       5.375
    37  
    38  
    39  C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
    40  C                       unsigned shift);
    41  C
        C Shift the size-limb number at src left by shift bits and store the
        C size-limb result at dst.  Limbs are 32 bits wide here (4-byte steps,
        C 32-bit register operations).  The value returned in eax holds the bits
        C shifted out of the most significant limb.
        C
        C Two code paths follow: a general one that works downwards from the
        C most significant limb using shld, and a shift-by-1 one that works
        C upwards from the least significant limb using adcl.
        C
        C NOTE(review): nothing here checks size or shift; one source limb is
        C read unconditionally, so presumably size >= 1 is required, and shift
        C is presumably expected to be 1..31 -- confirm against the documented
        C contract.
        C
    42  C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
    43  C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
    44  
        C Offsets of the four stack arguments.  defframe, deflit and FRAME are
        C m4 helpers defined outside this file (presumably reached through the
        C included config.m4 -- confirm).
    45  defframe(PARAM_SHIFT,16)
    46  defframe(PARAM_SIZE, 12)
    47  defframe(PARAM_SRC,  8)
    48  defframe(PARAM_DST,  4)
    49  
    50  	TEXT
    51  	ALIGN(8)
    52  PROLOGUE(mpn_lshift)
    53  
        C Save the four registers used besides eax, ecx and edx.  The pops
        C before each ret restore them in reverse order.
    54  	pushl	%edi
    55  	pushl	%esi
    56  	pushl	%ebx
    57  	pushl	%ebp
        C Four 4-byte pushes so far, hence 16.
    58  deflit(`FRAME',16)
    59  
        C Register roles from here on:
        C   edi = dst pointer, esi = src pointer, ebp = limb/loop counter,
        C   ecx = shift count (cl is the count operand of the shifts),
        C   eax, ebx, edx = limbs in flight.
    60  	movl	PARAM_DST,%edi
    61  	movl	PARAM_SRC,%esi
    62  	movl	PARAM_SIZE,%ebp
    63  	movl	PARAM_SHIFT,%ecx
    64  
    65  C We can use faster code for shift-by-1 under certain conditions.
        C Both pointer compares below are unsigned (jnc).  The special path is
        C taken only when shift is 1 and either src + 4 bytes >= dst or
        C dst >= src + 4*size bytes; everything else uses the normal path.
    66  	cmp	$1,%ecx
    67  	jne	L(normal)
    68  	leal	4(%esi),%eax
    69  	cmpl	%edi,%eax
    70  	jnc	L(special)		C jump if s_ptr + 1 >= res_ptr
    71  	leal	(%esi,%ebp,4),%eax
    72  	cmpl	%eax,%edi
    73  	jnc	L(special)		C jump if res_ptr >= s_ptr + size
    74  
        C General path: start at the most significant limb (highest address)
        C and work down to the least significant one.
    75  L(normal):
        C Point edi and esi at the last limb of dst and src.
    76  	leal	-4(%edi,%ebp,4),%edi
    77  	leal	-4(%esi,%ebp,4),%esi
    78  
        C edx = most significant source limb; esi then steps down to the limb
        C below it.  With eax zeroed first, the shld leaves in eax the bits of
        C edx that are shifted out at the top.
        C shldl is an m4 wrapper defined outside this file, used here as the
        C shld instruction (presumably it papers over assembler differences in
        C how the cl operand is written -- confirm).
    79  	movl	(%esi),%edx
    80  	subl	$4,%esi
    81  	xorl	%eax,%eax
    82  	shldl(	%cl, %edx, %eax)	C compute carry limb
    83  	pushl	%eax			C push carry limb onto stack
    84  
        C ebp = size-1 = number of source limbs still to be loaded.  A copy is
        C kept on the stack for the tail loop; ebp then becomes the number of
        C full 8-limb passes.  With none to do, go straight to the tail.
    85  	decl	%ebp
    86  	pushl	%ebp
    87  	shrl	$3,%ebp
    88  	jz	L(end)
    89  
        C The value loaded into eax here and at the top of each pass is never
        C used (eax is reloaded from src before any use); the loads only touch
        C the destination memory.
    90  	movl	(%edi),%eax		C fetch destination cache line
    91  
        C Unrolled loop, 8 limbs stored per pass, descending addresses.
        C On entry edx holds the source limb one above the limb esi points at.
        C Each shld shifts its destination register left by cl and fills the
        C vacated low bits from the top bits of the next lower source limb, so
        C each stored limb combines two adjacent source limbs.  eax, ebx and
        C edx rotate roles so that a limb loaded as the lower half of one
        C result serves as the upper half of the next.  edx carries the last
        C loaded limb over into the next pass.
    92  	ALIGN(4)
    93  L(oop):	movl	-28(%edi),%eax		C fetch destination cache line
    94  	movl	%edx,%ebx
    95  
    96  	movl	(%esi),%eax
    97  	movl	-4(%esi),%edx
    98  	shldl(	%cl, %eax, %ebx)
    99  	shldl(	%cl, %edx, %eax)
   100  	movl	%ebx,(%edi)
   101  	movl	%eax,-4(%edi)
   102  
   103  	movl	-8(%esi),%ebx
   104  	movl	-12(%esi),%eax
   105  	shldl(	%cl, %ebx, %edx)
   106  	shldl(	%cl, %eax, %ebx)
   107  	movl	%edx,-8(%edi)
   108  	movl	%ebx,-12(%edi)
   109  
   110  	movl	-16(%esi),%edx
   111  	movl	-20(%esi),%ebx
   112  	shldl(	%cl, %edx, %eax)
   113  	shldl(	%cl, %ebx, %edx)
   114  	movl	%eax,-16(%edi)
   115  	movl	%edx,-20(%edi)
   116  
   117  	movl	-24(%esi),%eax
   118  	movl	-28(%esi),%edx
   119  	shldl(	%cl, %eax, %ebx)
   120  	shldl(	%cl, %edx, %eax)
   121  	movl	%ebx,-24(%edi)
   122  	movl	%eax,-28(%edi)
   123  
   124  	subl	$32,%esi
   125  	subl	$32,%edi
   126  	decl	%ebp
   127  	jnz	L(oop)
   128  
        C Tail: recover size-1 from the stack; its low three bits give the
        C number of limbs left over after the 8-limb passes.  The loop below
        C handles them one per pass, again keeping the upper limb in edx.
   129  L(end):	popl	%ebp
   130  	andl	$7,%ebp
   131  	jz	L(end2)
   132  L(oop2):
   133  	movl	(%esi),%eax
   134  	shldl(	%cl,%eax,%edx)
   135  	movl	%edx,(%edi)
   136  	movl	%eax,%edx
   137  	subl	$4,%esi
   138  	subl	$4,%edi
   139  	decl	%ebp
   140  	jnz	L(oop2)
   141  
        C edx now holds the least significant source limb; there is no lower
        C limb to take bits from, so a plain shift fills it with zeros.
   142  L(end2):
   143  	shll	%cl,%edx		C compute least significant limb
   144  	movl	%edx,(%edi)		C store it
   145  
        C The carry limb pushed at the start of this path is the return value.
   146  	popl	%eax			C pop carry limb
   147  
   148  	popl	%ebp
   149  	popl	%ebx
   150  	popl	%esi
   151  	popl	%edi
   152  	ret
   153  
   154  
   155  C We loop from least significant end of the arrays, which is only
   156  C permissible if the source and destination don't overlap, since the
   157  C function is documented to work for overlapping source and destination.
        C The checks at the top of the function admit this path only for a
        C shift of 1 with src + 4 bytes >= dst or dst >= src + 4*size bytes.
   158  
   159  L(special):
        C edx = least significant source limb; esi advances to the next one.
   160  	movl	(%esi),%edx
   161  	addl	$4,%esi
   162  
        C As in the normal path: ebp = size-1, saved on the stack for the tail
        C loop, then reduced to the number of full 8-limb passes.
   163  	decl	%ebp
   164  	pushl	%ebp
   165  	shrl	$3,%ebp
   166  
        C Adding edx to itself shifts the first limb left by one and leaves the
        C bit shifted out in the carry flag.  The incl/decl pair then tests ebp
        C for zero without disturbing that carry (inc and dec leave the carry
        C flag alone).
   167  	addl	%edx,%edx
   168  	incl	%ebp
   169  	decl	%ebp
   170  	jz	L(Lend)
   171  
        C As in the normal path, the eax values loaded from dst are never used.
   172  	movl	(%edi),%eax		C fetch destination cache line
   173  
        C Unrolled loop, 8 limbs per pass, ascending addresses.  Each
        C adcl reg,reg shifts a limb left by one, taking the carry from the
        C limb below as its low bit and passing its own top bit on in the
        C carry flag.  The carry chain must survive the whole pass and the loop
        C control, so only instructions that leave the carry flag alone (movl,
        C leal, decl) sit between the adcl instructions.  On entry edx holds
        C an already shifted limb that has not been stored yet.
   174  	ALIGN(4)
   175  L(Loop):
   176  	movl	28(%edi),%eax		C fetch destination cache line
   177  	movl	%edx,%ebx
   178  
   179  	movl	(%esi),%eax
   180  	movl	4(%esi),%edx
   181  	adcl	%eax,%eax
   182  	movl	%ebx,(%edi)
   183  	adcl	%edx,%edx
   184  	movl	%eax,4(%edi)
   185  
   186  	movl	8(%esi),%ebx
   187  	movl	12(%esi),%eax
   188  	adcl	%ebx,%ebx
   189  	movl	%edx,8(%edi)
   190  	adcl	%eax,%eax
   191  	movl	%ebx,12(%edi)
   192  
   193  	movl	16(%esi),%edx
   194  	movl	20(%esi),%ebx
   195  	adcl	%edx,%edx
   196  	movl	%eax,16(%edi)
   197  	adcl	%ebx,%ebx
   198  	movl	%edx,20(%edi)
   199  
   200  	movl	24(%esi),%eax
   201  	movl	28(%esi),%edx
   202  	adcl	%eax,%eax
   203  	movl	%ebx,24(%edi)
   204  	adcl	%edx,%edx
   205  	movl	%eax,28(%edi)
   206  
   207  	leal	32(%esi),%esi		C use leal not to clobber carry
   208  	leal	32(%edi),%edi
   209  	decl	%ebp
   210  	jnz	L(Loop)
   211  
        C Tail.  The andl below overwrites the carry flag, so the carry is
        C parked in eax first: sbbl eax,eax yields 0 or -1, and adding that
        C value to itself afterwards sets the carry flag again exactly when it
        C was set before.
   212  L(Lend):
   213  	popl	%ebp
   214  	sbbl	%eax,%eax		C save carry in %eax
   215  	andl	$7,%ebp
   216  	jz	L(Lend2)
   217  	addl	%eax,%eax		C restore carry from eax
        C One limb per pass: store the pending shifted limb, then load and
        C shift the next one.  The load comes before the store in each pass.
   218  L(Loop2):
   219  	movl	%edx,%ebx
   220  	movl	(%esi),%edx
   221  	adcl	%edx,%edx
   222  	movl	%ebx,(%edi)
   223  
   224  	leal	4(%esi),%esi		C use leal not to clobber carry
   225  	leal	4(%edi),%edi
   226  	decl	%ebp
   227  	jnz	L(Loop2)
   228  
   229  	jmp	L(L1)
   230  L(Lend2):
   231  	addl	%eax,%eax		C restore carry from eax
   232  L(L1):	movl	%edx,(%edi)		C store last limb
   233  
        C Return value: the bit shifted out of the most significant limb.
        C sbbl turns the carry flag into 0 or -1, negl turns that into 0 or 1.
   234  	sbbl	%eax,%eax
   235  	negl	%eax
   236  
   237  	popl	%ebp
   238  	popl	%ebx
   239  	popl	%esi
   240  	popl	%edi
   241  	ret
   242  
   243  EPILOGUE()