github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/com.asm

dnl  AMD64 mpn_com optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
dnl  Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                   cycles/limb   cycles/limb   cycles/limb   good
C                   aligned       unaligned     best seen     for cpu?
C AMD K8,K9         2.0           2.0                         N
C AMD K10           0.85          1.3                         Y/N
C AMD bull          1.40          1.40                        Y
C AMD pile          0.9-1.4       0.9-1.4                     Y
C AMD steam
C AMD excavator
C AMD bobcat        3.1           3.1                         N
C AMD jaguar        0.91          0.91          opt/opt       Y
C Intel P4          2.28          illop                       Y
C Intel core2       1.02          1.02                        N
C Intel NHM         0.53          0.68                        Y
C Intel SBR         0.51          0.75          opt/0.65      Y/N
C Intel IBR         0.50          0.57          opt/opt       Y
C Intel HWL         0.51          0.64          opt/0.58      Y
C Intel BWL         0.61          0.65          0.57/opt      Y
C Intel atom        3.68          3.68                        N
C Intel SLM         1.09          1.35                        N
C VIA nano          1.17          5.09                        Y/N

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
C operations.

C Instead of having separate loops for reading aligned and unaligned, we read
C using MOVDQU.  This seems to work great except for core2; there, performance
C doubles when reading using MOVDQA (for an aligned source).  It is unclear how
C to best handle the unaligned case there.

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_com)
        FUNC_ENTRY(3)

        pcmpeqb %xmm7, %xmm7            C set to 111...111

        test    $8, R8(rp)              C is rp 16-byte aligned?
        jz      L(ali)                  C jump if rp aligned
        mov     (up), %rax              C complement one limb to align rp
        lea     8(up), up
        not     %rax
        mov     %rax, (rp)
        lea     8(rp), rp
        dec     n

        sub     $14, n
        jc      L(sma)

        ALIGN(16)
L(top): movdqu  (up), %xmm0             C main loop: 14 limbs (112 bytes)
        movdqu  16(up), %xmm1           C per iteration
        movdqu  32(up), %xmm2
        movdqu  48(up), %xmm3
        movdqu  64(up), %xmm4
        movdqu  80(up), %xmm5
        movdqu  96(up), %xmm6
        lea     112(up), up
        pxor    %xmm7, %xmm0            C xor with all-ones = complement
        pxor    %xmm7, %xmm1
        pxor    %xmm7, %xmm2
        pxor    %xmm7, %xmm3
        pxor    %xmm7, %xmm4
        pxor    %xmm7, %xmm5
        pxor    %xmm7, %xmm6
        movdqa  %xmm0, (rp)             C aligned 16-byte stores
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        movdqa  %xmm4, 64(rp)
        movdqa  %xmm5, 80(rp)
        movdqa  %xmm6, 96(rp)
        lea     112(rp), rp
L(ali): sub     $14, n
        jnc     L(top)

L(sma): add     $14, n                  C 0 <= n <= 13; peel 8/4/2/1 limbs
        test    $8, R8(n)
        jz      1f
        movdqu  (up), %xmm0             C 8-limb block
        movdqu  16(up), %xmm1
        movdqu  32(up), %xmm2
        movdqu  48(up), %xmm3
        lea     64(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        pxor    %xmm7, %xmm2
        pxor    %xmm7, %xmm3
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        lea     64(rp), rp
1:
        test    $4, R8(n)
        jz      1f
        movdqu  (up), %xmm0             C 4-limb block
        movdqu  16(up), %xmm1
        lea     32(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        lea     32(rp), rp
1:
        test    $2, R8(n)
        jz      1f
        movdqu  (up), %xmm0             C 2-limb block
        lea     16(up), up
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)
        lea     16(rp), rp
1:
        test    $1, R8(n)
        jz      1f
        mov     (up), %rax              C final odd limb, done scalar
        not     %rax
        mov     %rax, (rp)
1:
L(don): FUNC_EXIT()
        ret
EPILOGUE()
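
The strategy described in the comments above can be summarised in portable C. Below is a minimal sketch using SSE2 intrinsics, not GMP code: com_sse_sketch and LIMB are hypothetical names, and it assumes the usual mpn preconditions (n >= 1, rp and up limb-aligned, non-overlapping operands). It aligns rp to 16 bytes with one scalar limb when needed, complements 14 limbs (112 bytes) per main-loop pass with unaligned loads and aligned stores, then peels the remainder in 8-, 4-, 2- and 1-limb chunks, mirroring L(top) and L(sma) above.

    #include <emmintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    typedef uint64_t LIMB;   /* stand-in for mp_limb_t on a 64-bit limb build */

    void com_sse_sketch(LIMB *rp, const LIMB *up, size_t n)
    {
        /* all-ones mask, as pcmpeqb %xmm7,%xmm7 produces above */
        const __m128i ones = _mm_set1_epi32(-1);

        if (((uintptr_t)rp & 8) != 0) {     /* rp is 8 mod 16: one scalar limb */
            *rp++ = ~*up++;
            n--;
        }
        while (n >= 14) {                   /* main loop, 14 limbs per pass */
            for (int i = 0; i < 7; i++) {
                __m128i x = _mm_loadu_si128((const __m128i *)(up + 2 * i));
                _mm_store_si128((__m128i *)(rp + 2 * i), _mm_xor_si128(x, ones));
            }
            up += 14;
            rp += 14;
            n -= 14;
        }
        for (size_t step = 8; step >= 2; step /= 2) {   /* 8-, 4-, 2-limb tails */
            if (n & step) {
                for (size_t i = 0; i < step; i += 2) {
                    __m128i x = _mm_loadu_si128((const __m128i *)(up + i));
                    _mm_store_si128((__m128i *)(rp + i), _mm_xor_si128(x, ones));
                }
                up += step;
                rp += step;
            }
        }
        if (n & 1)                          /* final odd limb, scalar */
            *rp = ~*up;
    }

Swapping _mm_loadu_si128 for _mm_load_si128 when up is known to be 16-byte aligned corresponds to the MOVDQA variant mentioned in the core2 note above; the asm deliberately avoids that extra loop and pays for it only on core2.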
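
For completeness, mpn_com is part of GMP's documented low-level mpn interface, so the routine above is reachable from ordinary C. A small usage sketch (the build command, e.g. `cc test.c -lgmp`, may vary per system):

    #include <stdio.h>
    #include <gmp.h>

    int main(void)
    {
        mp_limb_t up[3] = { 1, 2, 3 };
        mp_limb_t rp[3];

        mpn_com(rp, up, 3);               /* rp[i] = ~up[i] for i in [0, 3) */

        for (int i = 0; i < 3; i++)
            gmp_printf("%Mx\n", rp[i]);   /* 'M' is gmp_printf's mp_limb_t modifier */
        return 0;
    }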