github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/lshiftc.asm

github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/lshiftc.asm (about)

     1  dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE.
     2  
     3  dnl  Contributed to the GNU project by David Harvey and Torbjorn Granlund.
     4  
     5  dnl  Copyright 2010-2012 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  
    36  C	     cycles/limb	     cycles/limb	      good
    37  C          16-byte aligned         16-byte unaligned	    for cpu?
    38  C AMD K8,K9	 ?			 ?
    39  C AMD K10	 1.85  (1.635)		 1.9   (1.67)		Y
    40  C AMD bd1	 1.82  (1.75)		 1.82  (1.75)		Y
    41  C AMD bobcat	 4.5			 4.5
    42  C Intel P4	 3.6   (3.125)		 3.6   (3.125)		Y
    43  C Intel core2	 2.05  (1.67)		 2.55  (1.75)
    44  C Intel NHM	 2.05  (1.875)		 2.6   (2.25)
    45  C Intel SBR	 1.55  (1.44)		 2     (1.57)		Y
    46  C Intel atom	 ?			 ?
    47  C VIA nano	 2.5   (2.5)		 2.5   (2.5)		Y
    48  
    49  C We try to do as many 16-byte operations as possible.  The top-most and
    50  C bottom-most writes might need 8-byte operations.  We always write using
    51  C 16-byte operations, we read with both 8-byte and 16-byte operations.
    52  
    53  C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
    54  C not true.  The aligned case reads 16+8 bytes, the unaligned case reads
    55  C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
    56  
    57  C This is not yet great code:
    58  C   (1) The unaligned case makes too many reads.
    59  C   (2) We should do some unrolling, at least 2-way.
    60  C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
    61  C Nano.
    62  
    63  C INPUT PARAMETERS
    64  define(`rp',  `%rdi')		C destination limb pointer: every store below goes through rp
    65  define(`ap',  `%rsi')		C source limb pointer: every load below goes through ap
    66  define(`n',   `%rdx')		C limb count, also used as the running limb index (scaled by 8)
    67  define(`cnt', `%rcx')		C shift count; never referenced, the code names %rcx directly
    68  
    69  ASM_START()
    70  	TEXT
    71  	ALIGN(16)
    72  PROLOGUE(mpn_lshiftc)		C rp[] = complement of (ap[] << cnt) over n limbs, top limb first; rax = ap[n-1] >> (64-cnt)
    73  	movd	R32(%rcx), %xmm4		C xmm4 = cnt, the left-shift count used by psllq
    74  	mov	$64, R32(%rax)
    75  	sub	R32(%rcx), R32(%rax)	C eax = 64 - cnt
    76  	movd	R32(%rax), %xmm5		C xmm5 = 64 - cnt, the right-shift count used by psrlq
    77  
    78  	neg	R32(%rcx)			C ecx = -cnt; the shr below takes its count from cl
    79  	mov	-8(ap,n,8), %rax	C rax = ap[n-1], the top limb
    80  	shr	R8(%rcx), %rax		C rax = ap[n-1] >> (64-cnt), assuming 1 <= cnt <= 63 (TODO confirm); rax is not written again
    81  
    82  	pcmpeqb	%xmm7, %xmm7		C set to 111...111 (mask used by every pxor to complement a result)
    83  
    84  	cmp	$2, n
    85  	jle	L(le2)			C n <= 2 (signed compare): handled with 8-byte accesses only
    86  
    87  	lea	(rp,n,8), R32(%rcx)	C ecx = low 32 bits of rp + 8*n; rcx is free, the counts are in xmm4/xmm5
    88  	test	$8, R8(%rcx)		C bit 3 of rp + 8*n; NOTE(review): presumably limbs are 8-byte aligned, so this decides 16-byte alignment -- confirm
    89  	je	L(rp_aligned)		C bit clear: no single-limb fixup needed
    90  
    91  C Do one initial limb in order to make rp aligned
    92  	movq	-8(ap,n,8), %xmm0	C xmm0 = ap[n-1]
    93  	movq	-16(ap,n,8), %xmm1	C xmm1 = ap[n-2]
    94  	psllq	%xmm4, %xmm0		C ap[n-1] << cnt
    95  	psrlq	%xmm5, %xmm1		C ap[n-2] >> (64-cnt)
    96  	por	%xmm1, %xmm0		C combine the two parts
    97  	pxor	%xmm7, %xmm0		C complement
    98  	movq	%xmm0, -8(rp,n,8)	C rp[n-1] = result (8-byte store)
    99  	dec	n			C one limb done; bit 3 of rp + 8*n is now clear
   100  
   101  L(rp_aligned):
   102  	lea	(ap,n,8), R32(%rcx)	C ecx = low 32 bits of ap + 8*n
   103  	test	$8, R8(%rcx)		C same bit-3 test, now on the source
   104  	je	L(aent)			C bit clear, as for rp: enter the ap = rp (mod 16) loop
   105  	jmp	L(uent)			C bit set: enter the ap != rp (mod 16) loop
   106  C *****************************************************************************
   107  
   108  C Handle the case when ap != rp (mod 16).  In the comments n is the value before the sub at L(uent).
   109  
   110  	ALIGN(16)
   111  L(utop):movq	(ap,n,8), %xmm1		C xmm1 low half = ap[n-2]
   112  	punpcklqdq  8(ap,n,8), %xmm1	C xmm1 = B*ap[n-1] + ap[n-2]
   113  	movdqa	-8(ap,n,8), %xmm0	C xmm0 = B*ap[n-2] + ap[n-3]; movdqa needs a 16-byte aligned address
   114  	psllq	%xmm4, %xmm1		C each half << cnt
   115  	psrlq	%xmm5, %xmm0		C each half >> (64-cnt)
   116  	por	%xmm1, %xmm0		C combine the two parts
   117  	pxor	%xmm7, %xmm0		C complement
   118  	movdqa	%xmm0, (rp,n,8)		C store rp[n-1] and rp[n-2] with one 16-byte write
   119  L(uent):sub	$2, n			C take two more limbs
   120  	ja	L(utop)			C loop while the new n is above zero (no borrow, not zero)
   121  
   122  	jne	L(end8)			C nonzero here means the sub borrowed (n was odd): only rp[0] is left
   123  
   124  	movq	(ap), %xmm1		C n was even, rp[1] and rp[0] are left; xmm1 low half = ap[0]
   125  	pxor	%xmm0, %xmm0		C xmm0 = 0
   126  	punpcklqdq  %xmm1, %xmm0	C xmm0 = B*ap[0] + 0, so zeros enter below ap[0]
   127  	punpcklqdq  8(ap), %xmm1	C xmm1 = B*ap[1] + ap[0]
   128  	psllq	%xmm4, %xmm1		C each half << cnt
   129  	psrlq	%xmm5, %xmm0		C each half >> (64-cnt)
   130  	por	%xmm1, %xmm0		C combine the two parts
   131  	pxor	%xmm7, %xmm0		C complement
   132  	movdqa	%xmm0, (rp)		C store rp[1] and rp[0]
   133  	ret				C rax still holds the value computed at entry
   134  C *****************************************************************************
   135  
   136  C Handle the case when ap = rp (mod 16).  In the comments n is the value before the sub at L(aent).
   137  
   138  	ALIGN(16)
   139  L(atop):movdqa	(ap,n,8), %xmm0		C xmm0 = B*ap[n-1] + ap[n-2]
   140  	movq	-8(ap,n,8), %xmm1	C xmm1 = ap[n-3]
   141  	punpcklqdq  %xmm0, %xmm1	C xmm1 = B*ap[n-2] + ap[n-3]
   142  	psllq	%xmm4, %xmm0		C each half << cnt
   143  	psrlq	%xmm5, %xmm1		C each half >> (64-cnt)
   144  	por	%xmm1, %xmm0		C combine the two parts
   145  	pxor	%xmm7, %xmm0		C complement
   146  	movdqa	%xmm0, (rp,n,8)		C store rp[n-1] and rp[n-2] with one 16-byte write
   147  L(aent):sub	$2, n			C take two more limbs
   148  	ja	L(atop)			C loop while the new n is above zero (no borrow, not zero)
   149  
   150  	jne	L(end8)			C nonzero here means the sub borrowed (n was odd): only rp[0] is left
   151  
   152  	movdqa	(ap), %xmm0		C n was even, rp[1] and rp[0] are left; xmm0 = B*ap[1] + ap[0]
   153  	pxor	%xmm1, %xmm1		C xmm1 = 0
   154  	punpcklqdq  %xmm0, %xmm1	C xmm1 = B*ap[0] + 0, so zeros enter below ap[0]
   155  	psllq	%xmm4, %xmm0		C each half << cnt
   156  	psrlq	%xmm5, %xmm1		C each half >> (64-cnt)
   157  	por	%xmm1, %xmm0		C combine the two parts
   158  	pxor	%xmm7, %xmm0		C complement
   159  	movdqa	%xmm0, (rp)		C store rp[1] and rp[0]
   160  	ret				C rax still holds the value computed at entry
   161  C *****************************************************************************
   162  
   163  	ALIGN(16)
   164  L(le2):	jne	L(end8)			C flags are still those of cmp $2, n: n != 2 goes straight to the last limb
   165  
   166  	movq	8(ap), %xmm0		C n = 2: xmm0 = ap[1]
   167  	movq	(ap), %xmm1		C xmm1 = ap[0]
   168  	psllq	%xmm4, %xmm0		C ap[1] << cnt
   169  	psrlq	%xmm5, %xmm1		C ap[0] >> (64-cnt)
   170  	por	%xmm1, %xmm0		C combine the two parts
   171  	pxor	%xmm7, %xmm0		C complement
   172  	movq	%xmm0, 8(rp)		C rp[1] = result (8-byte store), then fall through for rp[0]
   173  
   174  L(end8):movq	(ap), %xmm0		C last limb: xmm0 = ap[0]
   175  	psllq	%xmm4, %xmm0		C ap[0] << cnt, zeros enter at the bottom
   176  	pxor	%xmm7, %xmm0		C complement, so the shifted-in low bits become ones
   177  	movq	%xmm0, (rp)		C rp[0] = result (8-byte store)
   178  	ret				C rax still holds the value computed at entry
   179  EPILOGUE()