github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastavx/copyi.asm

dnl  AMD64 mpn_copyi optimised for CPUs with fast AVX.

dnl  Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C AMD K8,K9	n/a
C AMD K10	n/a
C AMD bull	n/a
C AMD pile	 4.87		 4.87				N
C AMD steam	 ?		 ?
C AMD bobcat	n/a
C AMD jaguar	n/a
C Intel P4	n/a
C Intel core	n/a
C Intel NHM	n/a
C Intel SBR	 0.50		 0.91				N
C Intel IBR	 0.50		 0.65				N
C Intel HWL	 0.25		 0.30				Y
C Intel BWL	 0.28		 0.37				Y
C Intel atom	n/a
C VIA nano	n/a

C We try to do as many 32-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  For the bulk copying, we
C write using aligned 32-byte operations, but we read with both aligned and
C unaligned 32-byte operations.
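C
C A rough C-level sketch of that strategy (illustrative only, not GMP code;
C the limb type and function name below are made up).  The code that follows
C realises the middle loop with vmovdqu loads and aligned vmovdqa stores,
C 128 bytes per iteration.
C
C   #include <stddef.h>
C   #include <stdint.h>
C
C   typedef uint64_t limb_t;			/* one 8-byte limb */
C
C   static void
C   copyi_sketch (limb_t *rp, const limb_t *up, size_t n)
C   {
C     while (n != 0 && ((uintptr_t) rp & 31) != 0)	/* align rp to 32 bytes */
C       { *rp++ = *up++; n--; }
C     while (n >= 16)				/* bulk: 4 x 32-byte chunks */
C       {
C         for (size_t i = 0; i < 16; i++)
C           rp[i] = up[i];
C         rp += 16; up += 16; n -= 16;
C       }
C     while (n-- != 0)				/* 0-15 limb tail */
C       *rp++ = *up++;
C   }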

define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl define(`vmovdqu', vlddqu)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_copyi)
	FUNC_ENTRY(3)

	cmp	$7, n
	jbe	L(bc)

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(a2)			C jump if rp aligned
	mov	(up), %rax
	lea	8(up), up
	mov	%rax, (rp)
	lea	8(rp), rp
	dec	n
L(a2):	test	$16, R8(rp)		C is rp 32-byte aligned?
	jz	L(a3)			C jump if rp aligned
	vmovdqu	(up), %xmm0
	lea	16(up), up
	vmovdqa	%xmm0, (rp)
	lea	16(rp), rp
	sub	$2, n
L(a3):	sub	$16, n
	jc	L(sma)

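C Main loop: copy 16 limbs (128 bytes) per iteration, reading with possibly
C unaligned vmovdqu and writing with aligned vmovdqa.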
	ALIGN(16)
L(top):	vmovdqu	(up), %ymm0
	vmovdqu	32(up), %ymm1
	vmovdqu	64(up), %ymm2
	vmovdqu	96(up), %ymm3
	lea	128(up), up
	vmovdqa	%ymm0, (rp)
	vmovdqa	%ymm1, 32(rp)
	vmovdqa	%ymm2, 64(rp)
	vmovdqa	%ymm3, 96(rp)
	lea	128(rp), rp
L(ali):	sub	$16, n
	jnc	L(top)

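C Fewer than 16 limbs remain.  n is now offset by -16, but its low four bits
C still encode the remaining count, so copy 8, 4, 2 and finally 1 limbs as
C the corresponding bits dictate.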
L(sma):	test	$8, R8(n)
	jz	1f
	vmovdqu	(up), %ymm0
	vmovdqu	32(up), %ymm1
	lea	64(up), up
	vmovdqa	%ymm0, (rp)
	vmovdqa	%ymm1, 32(rp)
	lea	64(rp), rp
1:
	test	$4, R8(n)
	jz	1f
	vmovdqu	(up), %ymm0
	lea	32(up), up
	vmovdqa	%ymm0, (rp)
	lea	32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	vmovdqu	(up), %xmm0
	lea	16(up), up
	vmovdqa	%xmm0, (rp)
	lea	16(rp), rp
1:
L(end):	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)
1:
	FUNC_EXIT()
	ret

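C Base case for n <= 7: copy with plain 64-bit moves, 4, 2 and 1 limbs at a
C time according to the bits of n.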
	ALIGN(16)
L(bc):	test	$4, R8(n)
	jz	1f
	mov	(up), %rax
	mov	8(up), %rcx
	mov	16(up), %r8
	mov	24(up), %r9
	lea	32(up), up
	mov	%rax, (rp)
	mov	%rcx, 8(rp)
	mov	%r8, 16(rp)
	mov	%r9, 24(rp)
	lea	32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	mov	(up), %rax
	mov	8(up), %rcx
	lea	16(up), up
	mov	%rax, (rp)
	mov	%rcx, 8(rp)
	lea	16(rp), rp
1:
	test	$1, R8(n)
	jz	1f
	mov	(up), %rax
	mov	%rax, (rp)
1:
	FUNC_EXIT()
	ret
EPILOGUE()