github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastavx/copyd.asm

dnl  AMD64 mpn_copyd optimised for CPUs with fast AVX.

dnl  Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C AMD K8,K9	n/a
C AMD K10	n/a
C AMD bull	n/a
C AMD pile	 4.87		 4.87				N
C AMD steam	 ?		 ?
C AMD bobcat	n/a
C AMD jaguar	n/a
C Intel P4	n/a
C Intel core	n/a
C Intel NHM	n/a
C Intel SBR	 0.50		 0.91				N
C Intel IBR	 0.50		 0.65				N
C Intel HWL	 0.25		 0.30				Y
C Intel BWL	 0.28		 0.37				Y
C Intel atom	n/a
C VIA nano	n/a

C We try to do as many 32-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  For the bulk copying, we
C write using aligned 32-byte operations, but we read with both aligned and
C unaligned 32-byte operations.
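C
C A plain-C model of what this routine computes, kept here only as a
C reader's sketch (mp_ptr, mp_srcptr and mp_size_t are the usual gmp.h
C types); the copy runs from the highest limb downwards, which is the
C direction that matters when the operands overlap with rp above up:
C
C	void
C	mpn_copyd (mp_ptr rp, mp_srcptr up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = n - 1; i >= 0; i--)	/* high limbs first */
C	    rp[i] = up[i];
C	}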

define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl define(`vmovdqu', vlddqu)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_copyd)
	FUNC_ENTRY(3)

	lea	-32(rp,n,8), rp
	lea	-32(up,n,8), up
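C rp and up now point 32 bytes below the ends of the operands, i.e. at
C their top four limbs; everything below copies downwards from here.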

	cmp	$7, n			C basecase needed for correctness
	jbe	L(bc)

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(a2)			C jump if rp aligned
	mov	24(up), %rax
	lea	-8(up), up
	mov	%rax, 24(rp)
	lea	-8(rp), rp
	dec	n
L(a2):	test	$16, R8(rp)		C is rp 32-byte aligned?
	jz	L(a3)			C jump if rp aligned
	vmovdqu	16(up), %xmm0
	lea	-16(up), up
	vmovdqa	%xmm0, 16(rp)
	lea	-16(rp), rp
	sub	$2, n
L(a3):	sub	$16, n
	jc	L(sma)

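C Main loop: copy 16 limbs (128 bytes) per iteration, reading with
C unaligned 32-byte loads and writing with aligned 32-byte stores.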
	ALIGN(16)
L(top):	vmovdqu	(up), %ymm0
	vmovdqu	-32(up), %ymm1
	vmovdqu	-64(up), %ymm2
	vmovdqu	-96(up), %ymm3
	lea	-128(up), up
	vmovdqa	%ymm0, (rp)
	vmovdqa	%ymm1, -32(rp)
	vmovdqa	%ymm2, -64(rp)
	vmovdqa	%ymm3, -96(rp)
	lea	-128(rp), rp
L(ali):	sub	$16, n
	jnc	L(top)

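C Tail: n is now remaining-16, but its low four bits still give the
C 0..15 limbs left, handled in chunks of 8, 4, 2 and 1 limbs.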
L(sma):	test	$8, R8(n)
	jz	1f
	vmovdqu	(up), %ymm0
	vmovdqu	-32(up), %ymm1
	lea	-64(up), up
	vmovdqa	%ymm0, (rp)
	vmovdqa	%ymm1, -32(rp)
	lea	-64(rp), rp
1:
	test	$4, R8(n)
	jz	1f
	vmovdqu	(up), %ymm0
	lea	-32(up), up
	vmovdqa	%ymm0, (rp)
	lea	-32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	vmovdqu	16(up), %xmm0
	lea	-16(up), up
	vmovdqa	%xmm0, 16(rp)
	lea	-16(rp), rp
1:
	test	$1, R8(n)
	jz	1f
	mov	24(up), %r8
	mov	%r8, 24(rp)
1:
	FUNC_EXIT()
	ret

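C Basecase for n <= 7 limbs: plain 64-bit moves only, copying from the
C highest limbs downwards.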
	ALIGN(16)
L(bc):	test	$4, R8(n)
	jz	1f
	mov	24(up), %rax
	mov	16(up), %rcx
	mov	8(up), %r8
	mov	(up), %r9
	lea	-32(up), up
	mov	%rax, 24(rp)
	mov	%rcx, 16(rp)
	mov	%r8, 8(rp)
	mov	%r9, (rp)
	lea	-32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	mov	24(up), %rax
	mov	16(up), %rcx
	lea	-16(up), up
	mov	%rax, 24(rp)
	mov	%rcx, 16(rp)
	lea	-16(rp), rp
1:
	test	$1, R8(n)
	jz	1f
	mov	24(up), %rax
	mov	%rax, 24(rp)
1:
	FUNC_EXIT()
	ret
EPILOGUE()