github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/copyi.asm

dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
dnl  Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C AMD K8,K9
C AMD K10	 0.85		 1.64				Y/N
C AMD bull	 1.4		 1.4				N
C AMD pile	 0.77		 0.93				N
C AMD steam	 ?		 ?
C AMD excavator	 ?		 ?
C AMD bobcat
C AMD jaguar	 0.65		 1.02		opt/0.93	Y/N
C Intel P4	 2.3		 2.3				Y
C Intel core	 1.0		 1.0		0.52/0.64	N
C Intel NHM	 0.5		 0.67				Y
C Intel SBR	 0.51		 0.75		opt/0.54	Y/N
C Intel IBR	 0.50		 0.57		opt/0.54	Y
C Intel HWL	 0.50		 0.57		opt/0.51	Y
C Intel BWL	 0.55		 0.62		opt/0.55	Y
C Intel atom
C Intel SLM	 1.02		 1.27		opt/1.07	Y/N
C VIA nano	 1.16		 5.16				Y/N

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
C operations.

C Instead of having separate loops for reading aligned and unaligned, we read
C using MOVDQU.  This seems to work great except for core2; there, performance
C doubles when reading using MOVDQA (for aligned source).  It is unclear how to
C best handle the unaligned case there.

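C The commented-out C fragment below is illustrative only and is not part of
C the GMP sources: a minimal sketch of the same strategy using SSE2 intrinsics
C (the name copyi_sketch is ours), assuming 64-bit limbs and an rp that is at
C least 8-byte aligned, as mp_ptr always is.  It fixes rp up to 16-byte
C alignment with a single limb copy, then pairs unaligned loads with aligned
C stores, and finally copies any leftover limb.
C
C	#include <emmintrin.h>		/* SSE2 intrinsics */
C	#include <stddef.h>
C	#include <stdint.h>
C
C	static void
C	copyi_sketch (uint64_t *rp, const uint64_t *up, size_t n)
C	{
C	  if (n != 0 && ((uintptr_t) rp & 8) != 0)	/* make rp 16-byte aligned */
C	    *rp++ = *up++, n--;
C	  for (; n >= 2; n -= 2, up += 2, rp += 2)
C	    _mm_store_si128 ((__m128i *) rp,		/* aligned 16-byte store */
C			     _mm_loadu_si128 ((const __m128i *) up));	/* unaligned load */
C	  if (n != 0)					/* odd leftover limb */
C	    *rp = *up;
C	}
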
C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl define(`movdqu', lddqu)

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_copyi)
	FUNC_ENTRY(3)

	cmp	$3, n			C NB: bc code below assumes this limit
	jc	L(bc)

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(ali)			C jump if rp aligned
	movsq				C copy single limb
	dec	n

	sub	$16, n
	jc	L(sma)

	ALIGN(16)
L(top):	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	movdqu	32(up), %xmm2
	movdqu	48(up), %xmm3
	movdqu	64(up), %xmm4
	movdqu	80(up), %xmm5
	movdqu	96(up), %xmm6
	movdqu	112(up), %xmm7
	lea	128(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	movdqa	%xmm4, 64(rp)
	movdqa	%xmm5, 80(rp)
	movdqa	%xmm6, 96(rp)
	movdqa	%xmm7, 112(rp)
	lea	128(rp), rp
L(ali):	sub	$16, n
	jnc	L(top)

L(sma):	test	$8, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	movdqu	32(up), %xmm2
	movdqu	48(up), %xmm3
	lea	64(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	lea	64(rp), rp
1:
	test	$4, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	lea	32(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	lea	32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp
	ALIGN(16)
1:
L(end):	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)
1:
	FUNC_EXIT()
	ret

C Basecase code.  Needed for good speed on small operands, not for correctness
C as the above code is currently written.  The commented-out lines need to be
C reinstated if this code is to be used for n > 3, and then the post-loop
C offsets need fixing.

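C For reference only (not part of the GMP sources): the small-n path reached
C here (currently n < 3, though the code as written would also handle n == 3)
C is equivalent to the following C fragment, assuming 64-bit limbs:
C
C	if (n >= 2) { rp[0] = up[0]; rp[1] = up[1]; }
C	if (n & 1)  rp[n - 1] = up[n - 1];
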
L(bc):	sub	$2, n
	jc	L(end)
	ALIGN(16)
1:	mov	(up), %rax
	mov	8(up), %rcx
dnl	lea	16(up), up
	mov	%rax, (rp)
	mov	%rcx, 8(rp)
dnl	lea	16(rp), rp
dnl	sub	$2, n
dnl	jnc	1b

	test	$1, R8(n)
	jz	L(ret)
	mov	16(up), %rax
	mov	%rax, 16(rp)
L(ret):	FUNC_EXIT()
	ret
EPILOGUE()