github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/p6/copyd.asm (about)

     1  dnl  Intel P6 mpn_copyd -- copy limb vector backwards.
     2  
     3  dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C P6: 1.75 cycles/limb, or 0.75 if no overlap
    35  
    36  
    37  C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
    38  C
    39  C An explicit loop is used because a decrementing rep movsl is a bit slow at
    40  C 2.4 c/l.  That rep movsl also has about a 40 cycle startup time, and the
    41  C code here stands a chance of being faster if the branches predict well.
    42  C
    43  C The slightly strange loop form seems necessary for the claimed speed.
    44  C Maybe load/store ordering affects it.
    45  C
    46  C The source and destination are checked to see if they're actually
    47  C overlapping, since it might be possible to use an incrementing rep movsl
    48  C at 0.75 c/l.  (It doesn't suffer the bad startup time of the decrementing
    49  C version.)
    50  C
    51  C Enhancements:
    52  C
    53  C Top speed for an all-integer copy is probably 1.0 c/l, being one load and
    54  C one store each cycle.  Unrolling the loop below would approach 1.0, but
    55  C it'd be good to know why something like store/load/subl + store/load/jnz
    56  C doesn't already run at 1.0 c/l.  It looks like it should decode in 2
    57  C cycles, but doesn't run that way.
    58  
    59  defframe(PARAM_DST, 4)
    60  defframe(PARAM_SRC, 8)
    61  defframe(PARAM_SIZE,12)
    62  
    63  dnl  the size and src slots are read before being overwritten, so they double as register save areas
    64  define(SAVE_EDI,`PARAM_SRC')
    65  define(SAVE_ESI,`PARAM_SIZE')
    66  
    67  	TEXT
    68  	ALIGN(16)
    69  
    70  PROLOGUE(mpn_copyd)
    71  deflit(`FRAME',0)
    72  	C Copies size limbs from src to dst.  eax, ecx and edx are clobbered; esi and edi are saved in the argument slots and restored before returning.
    73  	movl	PARAM_SIZE, %ecx		C fetch size before its slot is reused as SAVE_ESI
    74  
    75  	movl	%esi, SAVE_ESI
    76  	movl	PARAM_SRC, %esi			C fetch src before its slot is reused as SAVE_EDI
    77  
    78  	movl	%edi, SAVE_EDI
    79  	movl	PARAM_DST, %edi
    80  
    81  	subl	$1, %ecx			C ecx = size-1, borrow means size was 0
    82  	jb	L(zero)
    83  
    84  	movl	(%esi,%ecx,4), %eax		C src[size-1]
    85  	jz	L(one)				C flags are still those of the subl above
    86  
    87  	movl	-4(%esi,%ecx,4), %edx		C src[size-2]
    88  	subl	$2, %ecx			C ecx = size-3
    89  	jbe	L(done_loop)			C 2 or 3 limbs only
    90  
    91  
    92  	C The usual overlap is
    93  	C
    94  	C     high                   low
    95  	C     +------------------+
    96  	C     |               dst|
    97  	C     +------------------+
    98  	C           +------------------+
    99  	C           |               src|
   100  	C           +------------------+
   101  	C
   102  	C We can use an incrementing copy in the following circumstances.
   103  	C
   104  	C     src+4*size<=dst, since then the regions are disjoint
   105  	C
   106  	C     src==dst, clearly (though this shouldn't occur normally)
   107  	C
   108  	C     src>dst, since in that case it's a requirement of the
   109  	C              parameters that src>=dst+size*4, and hence the
   110  	C              regions are disjoint
   111  	C
   112  	C ecx is size-3 here, so the displacement of 12 makes edx = src+4*size, one past the end of the source.
   113  	leal	12(%esi,%ecx,4), %edx
   114  	cmpl	%edi, %esi
   115  	jae	L(use_movsl)		C src >= dst
   116  
   117  	cmpl	%edi, %edx
   118  	movl	4(%esi,%ecx,4), %edx	C src[size-2] again, movl leaves the cmpl flags intact
   119  	jbe	L(use_movsl)		C src+4*size <= dst
   120  
   121  
   122  L(top):
   123  	C eax	prev high limb
   124  	C ebx
   125  	C ecx	counter, size-3 down to 0 or -1, inclusive, by 2s
   126  	C edx	prev low limb
   127  	C esi	src
   128  	C edi	dst
   129  	C ebp
   130  
   131  	movl	%eax, 8(%edi,%ecx,4)
   132  	movl	(%esi,%ecx,4), %eax
   133  
   134  	movl	%edx, 4(%edi,%ecx,4)
   135  	movl	-4(%esi,%ecx,4), %edx
   136  
   137  	subl	$2, %ecx
   138  	jnbe	L(top)			C loop while ecx was above 2 before the subl
   139  
   140  
   141  L(done_loop):
   142  	movl	%eax, 8(%edi,%ecx,4)	C flush the two limbs still held in registers
   143  	movl	%edx, 4(%edi,%ecx,4)
   144  
   145  	C copy low limb (needed if size was odd, but will already have been
   146  	C done in the loop if size was even)
   147  	movl	(%esi), %eax
   148  L(one):
   149  	movl	%eax, (%edi)
   150  	movl	SAVE_EDI, %edi
   151  	movl	SAVE_ESI, %esi
   152  
   153  	ret
   154  
   155  
   156  L(use_movsl):
   157  	C eax
   158  	C ebx
   159  	C ecx	size-3
   160  	C edx
   161  	C esi	src
   162  	C edi	dst
   163  	C ebp
   164  
   165  	addl	$3, %ecx		C ecx = size, the rep count
   166  
   167  	cld		C better safe than sorry, see mpn/x86/README
   168  
   169  	rep
   170  	movsl
   171  
   172  L(zero):
   173  	movl	SAVE_ESI, %esi		C also discards the pointer advance made by rep movsl
   174  	movl	SAVE_EDI, %edi
   175  
   176  	ret
   177  
   178  EPILOGUE()