github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/mmx/copyi.asm (about)

     1  dnl  AMD K7 mpn_copyi -- copy limb vector, incrementing.
     2  
     3  dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C    alignment dst/src, A=0mod8 N=4mod8
    35  C       A/A   A/N   N/A   N/N
    36  C K7    0.75  1.0   1.0   0.75
    37  
    38  
    39  C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
    40  C
    41  C Copy src,size to dst,size.
    42  C
    43  C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
    44  C 1.33 c/l.
    45  C
    46  C The K7 can do a 64-bit load and 64-bit store in one cycle (optimization
    47  C guide 22007 appendix B), so 0.5 c/l should be possible, however nothing
    48  C under 0.7 c/l is known.  Apparently only two 32-bit stores can be done in
    49  C one cycle, so perhaps some scheduling is needed to ensure it's a
    50  C load+store in each cycle, not store+store.
    51  C
    52  C If both source and destination are unaligned then one limb is processed at
    53  C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
    54  C used unaligned it would be 1.5 c/l.
    55  
    56  defframe(PARAM_SIZE,12)
    57  defframe(PARAM_SRC, 8)
    58  defframe(PARAM_DST, 4)
    59  
    60  dnl  parameter space reused: size is loaded into ecx first, then ebx saved here
    61  define(SAVE_EBX,`PARAM_SIZE')
    62  
    63  dnl  minimum 5: unrolled code may copy 1 alignment limb, then always 4 at L(top)
    64  deflit(UNROLL_THRESHOLD, 5)
    65  
    66  	TEXT
    67  	ALIGN(32)
    68  PROLOGUE(mpn_copyi)
    69  deflit(`FRAME',0)
    70  	C size < UNROLL_THRESHOLD goes to L(simple), otherwise to L(unroll)
    71  	movl	PARAM_SIZE, %ecx	C ecx = size, in limbs
    72  	movl	%ebx, SAVE_EBX		C size slot is free now, keep ebx in it
    73  
    74  	movl	PARAM_SRC, %eax		C eax = src
    75  	movl	PARAM_DST, %edx		C edx = dst
    76  
    77  	cmpl	$UNROLL_THRESHOLD, %ecx
    78  	jae	L(unroll)		C unsigned size >= UNROLL_THRESHOLD
    79  
    80  	orl	%ecx, %ecx		C sets ZF from size, ecx unchanged
    81  	jz	L(simple_done)		C size == 0, nothing to copy
    82  
    83  L(simple):
    84  	C eax	src, incrementing
    85  	C ebx	scratch
    86  	C ecx	counter, limbs left to copy
    87  	C edx	dst, incrementing
    88  	C
    89  	C this loop is 2 cycles/limb
    90  
    91  	movl	(%eax), %ebx
    92  	movl	%ebx, (%edx)
    93  	decl	%ecx			C ZF from here feeds the jnz below
    94  	leal	4(%eax), %eax		C leal steps the pointers, flags untouched
    95  	leal	4(%edx), %edx
    96  	jnz	L(simple)
    97  
    98  L(simple_done):
    99  	movl	SAVE_EBX, %ebx		C restore ebx
   100  	ret				C no emms, no MMX register used on this path
   101  
   102  
   103  L(unroll):
   104  	movl	%eax, %ebx		C ebx = src, for the alignment test
   105  	leal	-12(%eax,%ecx,4), %eax	C src end - 12
   106  	subl	$3, %ecx		C size-3
   107  
   108  	andl	%edx, %ebx		C ebx = src & dst
   109  	leal	(%edx,%ecx,4), %edx	C dst end - 12
   110  	negl	%ecx			C ecx = -(size-3), stepped up towards zero
   111  
   112  	testl	$4, %ebx   C testl to pad code closer to 16 bytes for L(top)
   113  	jz	L(aligned)		C bit 2 clear in src or in dst
   114  
   115  	C both src and dst unaligned, process one limb to align them
   116  	movl	(%eax,%ecx,4), %ebx
   117  	movl	%ebx, (%edx,%ecx,4)
   118  	incl	%ecx			C one limb fewer to go
   119  L(aligned):
   120  
   121  
   122  	ALIGN(16)
   123  L(top):
   124  	C eax	src end - 12
   125  	C ebx	(not used in this loop)
   126  	C ecx	counter, negative, limbs
   127  	C edx	dst end - 12
   128  
   129  	movq	(%eax,%ecx,4), %mm0	C load limbs 0,1 of this group of four
   130  	movq	8(%eax,%ecx,4), %mm1	C load limbs 2,3
   131  	addl	$4, %ecx		C flags from here feed the ja below
   132  	movq	%mm0, -16(%edx,%ecx,4)	C -16 undoes the addl for the store address
   133  	movq	%mm1, -16+8(%edx,%ecx,4)
   134  	ja	L(top)		C jump no carry and not zero
   135  
   136  
   137  	C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
   138  
   139  	testb	$2, %cl
   140  	jnz	L(finish_not_two)	C ecx is 2 or 3, fewer than two limbs left
   141  
   142  	movq	(%eax,%ecx,4), %mm0	C ecx is 0 or 1, copy two limbs
   143  	movq	%mm0, (%edx,%ecx,4)
   144  L(finish_not_two):
   145  
   146  	testb	$1, %cl
   147  	jnz	L(done)			C ecx is 1 or 3, no odd limb left
   148  
   149  	movl	8(%eax), %ebx		C last limb, at src end - 4
   150  	movl	%ebx, 8(%edx)
   151  
   152  L(done):
   153  	movl	SAVE_EBX, %ebx		C restore ebx
   154  	emms				C mm0/mm1 were used above
   155  	ret
   156  
   157  EPILOGUE()