github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/copyi.asm (about)

     1  dnl  Pentium-4 mpn_copyi -- copy limb vector, incrementing.
     2  
     3  dnl  Copyright 1999-2001 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  
    32  dnl  The rep/movsl is very slow for small blocks on pentium4.  Its startup
    33  dnl  time seems to be about 110 cycles.  It then copies at a rate of one
    34  dnl  limb per cycle.  We therefore fall back to an open-coded 2 c/l copying
    35  dnl  loop for smaller sizes.
    36  
    37  dnl  Ultimately, we may want to use 64-bit movd or 128-bit movdqu in some
    38  dnl  nifty unrolled arrangement.  Clearly, that could reach much higher
    39  dnl  speeds, at least for large blocks.
    40  
    41  include(`../config.m4')
    42  
    43  
    44  defframe(PARAM_SIZE, 12)
    45  defframe(PARAM_SRC, 8)
    46  defframe(PARAM_DST,  4)
    47  
    48  	TEXT
    49  	ALIGN(8)
    50  
    51  PROLOGUE(mpn_copyi)
    52  deflit(`FRAME',0)
    53  
    54  	movl	PARAM_SIZE, %ecx
    55  	cmpl	$150, %ecx
    56  	jg	L(replmovs)
    57  
    58  	movl	PARAM_SRC, %eax
    59  	movl	PARAM_DST, %edx
    60  	movl	%ebx, PARAM_SIZE
    61  	testl	%ecx, %ecx
    62  	jz	L(end)
    63  
    64  L(loop):
    65  	movl	(%eax), %ebx
    66  	leal	4(%eax), %eax
    67  	addl	$-1, %ecx
    68  	movl	%ebx, (%edx)
    69  	leal	4(%edx), %edx
    70  
    71  	jnz	L(loop)
    72  
    73  L(end):
    74  	movl	PARAM_SIZE, %ebx
    75  	ret
    76  
    77  L(replmovs):
    78  	cld	C better safe than sorry, see mpn/x86/README
    79  
    80  	movl	%esi, %eax
    81  	movl	PARAM_SRC, %esi
    82  	movl	%edi, %edx
    83  	movl	PARAM_DST, %edi
    84  
    85  	rep
    86  	movsl
    87  
    88  	movl	%eax, %esi
    89  	movl	%edx, %edi
    90  
    91  	ret
    92  
    93  EPILOGUE()