github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/copyi.asm (about)

     1  dnl  Intel Pentium mpn_copyi -- copy limb vector, incrementing.
     2  
     3  dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C P5: 1.25 cycles/limb
    35  
    36  
    37  C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
    38  C
    39  C Destination prefetching is done to avoid repeated write-throughs on lines
    40  C not already in L1.
    41  C
    42  C At least one of the src or dst pointer needs to be incremented rather than
    43  C using indexing, so that there's somewhere to put the loop control without
    44  C an AGI.  Incrementing one and not two lets us keep loop overhead to 2
    45  C cycles.  Making it the src pointer incremented avoids an AGI on the %ecx
    46  C subtracts in the finishup code.
    47  C
    48  C The block of finishup code is almost as big as the main loop itself, which
    49  C is unfortunate, but it's faster that way than with say rep movsl, by about
    50  C 10 cycles for instance on P55.
    51  C
    52  C There's nothing to be gained from MMX on P55, since it can do only one
    53  C movq load (or store) per cycle, so the throughput would be the same as the
    54  C code here (and even then only if src and dst have the same alignment mod
    55  C 8).
    56  
    57  defframe(PARAM_SIZE,12)
    58  defframe(PARAM_SRC, 8)
    59  defframe(PARAM_DST, 4)
    60  
    61  	TEXT
    62  	ALIGN(8)
    63  PROLOGUE(mpn_copyi)
    64  deflit(`FRAME',0)
    65  
    66  	movl	PARAM_SIZE, %ecx
    67  	movl	PARAM_DST, %edx
    68  	C ebx and esi are saved here and restored at L(done)
    69  	pushl	%ebx	FRAME_pushl()
    70  	pushl	%esi	FRAME_pushl()
    71  
    72  	leal	(%edx,%ecx,4), %edx	C &dst[size], one past the last limb
    73  	xorl	$-1, %ecx		C -size-1
    74  
    75  	movl	PARAM_SRC, %esi
    76  	addl	$8, %ecx		C -size+7
    77  	C non-negative here means size <= 7, so the unrolled loop is skipped
    78  	jns	L(end)
    79  
    80  	movl	-28(%edx,%ecx,4), %eax	C fetch destination cache line, dst[0]
    81  	nop
    82  
    83  L(top):
    84  	C eax	scratch
    85  	C ebx	scratch
    86  	C ecx	counter, limbs, negative (7 minus the limbs still to copy)
    87  	C edx	&dst[size]
    88  	C esi	src, incrementing
    89  	C edi
    90  	C ebp
    91  	C Each pass copies 8 limbs.
    92  	movl	(%edx,%ecx,4), %eax	C fetch destination cache line (8th limb of this group)
    93  	addl	$8, %ecx		C sets the sign flag tested by the js below
    94  	C ecx is already advanced by 8, hence the -60 to -32 displacements
    95  	movl	(%esi), %eax		C read words pairwise
    96  	movl	4(%esi), %ebx
    97  	movl	%eax, -60(%edx,%ecx,4)	C store words pairwise
    98  	movl	%ebx, -56(%edx,%ecx,4)
    99  
   100  	movl	8(%esi), %eax
   101  	movl	12(%esi), %ebx
   102  	movl	%eax, -52(%edx,%ecx,4)
   103  	movl	%ebx, -48(%edx,%ecx,4)
   104  
   105  	movl	16(%esi), %eax
   106  	movl	20(%esi), %ebx
   107  	movl	%eax, -44(%edx,%ecx,4)
   108  	movl	%ebx, -40(%edx,%ecx,4)
   109  
   110  	movl	24(%esi), %eax
   111  	movl	28(%esi), %ebx
   112  	movl	%eax, -36(%edx,%ecx,4)
   113  	movl	%ebx, -32(%edx,%ecx,4)
   114  
   115  	leal	32(%esi), %esi		C advance src, leal leaves the flags alone
   116  	js	L(top)			C loop while the counter is still negative
   117  
   118  
   119  L(end):
   120  	C ecx	0 to 7, representing respectively 7 to 0 limbs remaining
   121  	C esi	src, next limb to read
   122  	C edx	&dst[size], stores are indexed back from it using ecx
   123  
   124  	subl	$4, %ecx
   125  	jns	L(no4)			C taken when fewer than 4 limbs remain
   126  
   127  	movl	(%esi), %eax
   128  	movl	4(%esi), %ebx
   129  	movl	%eax, -12(%edx,%ecx,4)
   130  	movl	%ebx, -8(%edx,%ecx,4)
   131  
   132  	movl	8(%esi), %eax
   133  	movl	12(%esi), %ebx
   134  	movl	%eax, -4(%edx,%ecx,4)
   135  	movl	%ebx, (%edx,%ecx,4)
   136  
   137  	addl	$16, %esi
   138  	addl	$4, %ecx
   139  L(no4):
   140  	C ecx	0 to 3, representing respectively 3 to 0 limbs remaining
   141  	subl	$2, %ecx
   142  	jns	L(no2)			C taken when fewer than 2 limbs remain
   143  
   144  	movl	(%esi), %eax
   145  	movl	4(%esi), %ebx
   146  	movl	%eax, -4(%edx,%ecx,4)
   147  	movl	%ebx, (%edx,%ecx,4)
   148  
   149  	addl	$8, %esi
   150  	addl	$2, %ecx		C sets the zero flag tested at L(no2)
   151  L(no2):
   152  	C ecx	0 or 1, representing respectively 1 or 0 limbs remaining
   153  	jnz	L(done)			C flags are from the last subl or addl on ecx
   154  
   155  	movl	(%esi), %eax
   156  	movl	%eax, -4(%edx,%ecx,4)	C risk of cache bank clash here
   157  
   158  L(done):
   159  	popl	%esi
   160  	popl	%ebx
   161  
   162  	ret
   163  
   164  EPILOGUE()