github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/copyd.asm

dnl  AMD64 mpn_copyd optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
dnl  Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C AMD K8,K9
C AMD K10	 0.85		 1.64				Y/N
C AMD bull	 1.4		 1.4				Y
C AMD pile	 0.68		 0.98				Y/N
C AMD steam
C AMD excavator
C AMD bobcat
C AMD jaguar	 0.65		 1.02		opt/0.93	Y/N
C Intel P4	 2.3		 2.3				Y
C Intel core	 1.0		 1.0		0.52/0.80	N
C Intel NHM	 0.5		 0.67				Y
C Intel SBR	 0.51		 0.75		opt/0.54	Y/N
C Intel IBR	 0.50		 0.57		opt/0.50	Y
C Intel HWL	 0.50		 0.57		opt/0.51	Y
C Intel BWL	 0.55		 0.62		opt/0.55	Y
C Intel atom
C Intel SLM	 1.02		 1.27		opt/1.04	Y/N
C VIA nano	 1.16		 5.16				Y/N
C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
C operations.

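C The following is a plain C shadow of that strategy, for illustration only
C (it is not part of the build; the helper name copyd_sketch, the use of
C uint64_t for limbs, and the SSE2 intrinsics standing in for MOVDQU/MOVDQA
C are assumptions made here, not names taken from GMP):
C
C   #include <emmintrin.h>
C   #include <stddef.h>
C   #include <stdint.h>
C
C   static void
C   copyd_sketch (uint64_t *rp, const uint64_t *up, size_t n)
C   {
C     /* Peel the top limb with a plain 64-bit copy when that is what it
C        takes to make every later 16-byte store aligned (limbs are assumed
C        to be at least 8-byte aligned).  */
C     if (n != 0 && (((uintptr_t) (rp + n)) & 8))
C       {
C         n--;
C         rp[n] = up[n];
C       }
C     /* Copy downwards in 16-byte chunks: possibly unaligned loads (MOVDQU),
C        always aligned stores (MOVDQA).  Going from high to low addresses is
C        what lets a decreasing copy cope with overlap where rp sits above up.  */
C     while (n >= 2)
C       {
C         n -= 2;
C         __m128i x = _mm_loadu_si128 ((const __m128i *) (up + n));
C         _mm_store_si128 ((__m128i *) (rp + n), x);
C       }
C     /* At most one limb remains at the bottom.  */
C     if (n != 0)
C       rp[0] = up[0];
C   }
C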
C Instead of having separate loops for aligned and unaligned reads, we always
C read using MOVDQU.  This works well except on Core 2, where performance
C doubles when an aligned source is read with MOVDQA.  It is unclear how best
C to handle the unaligned case there.

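C For comparison, the split that this file deliberately avoids would amount
C to something like the following in intrinsics terms (a rough sketch under
C the same assumptions as above; copyd_two_loops is a made-up name, and
C rp + n is taken to be 16-byte aligned already, as arranged by the
C top-limb peel):
C
C   static void
C   copyd_two_loops (uint64_t *rp, const uint64_t *up, size_t n)
C   {
C     if ((((uintptr_t) up - (uintptr_t) rp) & 15) == 0)
C       {
C         /* Source shares rp's 16-byte phase: aligned loads (MOVDQA).  */
C         while (n >= 2)
C           {
C             n -= 2;
C             _mm_store_si128 ((__m128i *) (rp + n),
C                              _mm_load_si128 ((const __m128i *) (up + n)));
C           }
C       }
C     else
C       {
C         /* Otherwise fall back to unaligned loads (MOVDQU).  */
C         while (n >= 2)
C           {
C             n -= 2;
C             _mm_store_si128 ((__m128i *) (rp + n),
C                              _mm_loadu_si128 ((const __m128i *) (up + n)));
C           }
C       }
C     if (n != 0)
C       rp[0] = up[0];
C   }
C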
C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl define(`movdqu', lddqu)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_copyd)
	FUNC_ENTRY(3)

	test	n, n
	jz	L(don)			C nothing to do for n = 0

	lea	-16(rp,n,8), rp		C point rp/up at the second-highest
	lea	-16(up,n,8), up		C limb; 8(rp) is then the top limb

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(ali)			C jump if rp aligned
	mov	8(up), %rax		C copy the top limb on its own, so
	lea	-8(up), up		C that the stores below are aligned
	mov	%rax, 8(rp)
	lea	-8(rp), rp
	dec	n

	sub	$16, n			C fewer than 16 limbs left?
	jc	L(sma)

	ALIGN(16)
L(top):	movdqu	(up), %xmm0		C main loop: 16 limbs per iteration,
	movdqu	-16(up), %xmm1		C unaligned loads, aligned stores
	movdqu	-32(up), %xmm2
	movdqu	-48(up), %xmm3
	movdqu	-64(up), %xmm4
	movdqu	-80(up), %xmm5
	movdqu	-96(up), %xmm6
	movdqu	-112(up), %xmm7
	lea	-128(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, -16(rp)
	movdqa	%xmm2, -32(rp)
	movdqa	%xmm3, -48(rp)
	movdqa	%xmm4, -64(rp)
	movdqa	%xmm5, -80(rp)
	movdqa	%xmm6, -96(rp)
	movdqa	%xmm7, -112(rp)
	lea	-128(rp), rp
L(ali):	sub	$16, n
	jnc	L(top)

L(sma):	test	$8, R8(n)		C copy 8 more limbs?
	jz	1f
	movdqu	(up), %xmm0
	movdqu	-16(up), %xmm1
	movdqu	-32(up), %xmm2
	movdqu	-48(up), %xmm3
	lea	-64(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, -16(rp)
	movdqa	%xmm2, -32(rp)
	movdqa	%xmm3, -48(rp)
	lea	-64(rp), rp
1:
	test	$4, R8(n)		C copy 4 more limbs?
	jz	1f
	movdqu	(up), %xmm0
	movdqu	-16(up), %xmm1
	lea	-32(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, -16(rp)
	lea	-32(rp), rp
1:
	test	$2, R8(n)		C copy 2 more limbs?
	jz	1f
	movdqu	(up), %xmm0
	lea	-16(up), up
	movdqa	%xmm0, (rp)
	lea	-16(rp), rp
1:
	test	$1, R8(n)		C copy the final limb?
	jz	1f
	mov	8(up), %r8
	mov	%r8, 8(rp)
1:
L(don):	FUNC_EXIT()
	ret
EPILOGUE()