github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/lshiftc-movdqu2.asm (about)

     1  dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE including fast movdqu.
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund.
     4  
     5  dnl  Copyright 2010-2012 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  
    36  C	     cycles/limb     cycles/limb     cycles/limb    good
    37  C              aligned	      unaligned	      best seen	   for cpu?
    38  C AMD K8,K9	 3		 3		 ?	  no, use shl/shr
    39  C AMD K10	 1.8-2.0	 1.8-2.0	 ?	  yes
    40  C AMD bd1	 1.9		 1.9		 ?	  yes
    41  C AMD bobcat	 3.67		 3.67			  yes, bad for n < 20
    42  C Intel P4	 4.75		 4.75		 ?	  no, slow movdqu
    43  C Intel core2	 2.27		 2.27		 ?	  no, use shld/shrd
    44  C Intel NHM	 2.15		 2.15		 ?	  no, use shld/shrd
    45  C Intel SBR	 1.45		 1.45		 ?	  yes, bad for n = 4-6
    46  C Intel atom	12.9		12.9		 ?	  no
    47  C VIA nano	 6.18		 6.44		 ?	  no, slow movdqu
    48  
    49  C We try to do as many aligned 16-byte operations as possible.  The top-most
    50  C and bottom-most writes might need 8-byte operations.
    51  C
    52  C This variant relies on fast movdqu loads, and uses movdqu even for aligned
    53  C in order to avoid the need for two separate loops.
    54  C
    55  C TODO
    56  C  * Could 2-limb wind-down code be simplified?
    57  C  * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
    58  C    for other affected CPUs.
    59  
    60  C INPUT PARAMETERS
    61  define(`rp',  `%rdi')
    62  define(`ap',  `%rsi')
    63  define(`n',   `%rdx')
    64  define(`cnt', `%rcx')
    65  
    66  ASM_START()
    67  	TEXT
    68  	ALIGN(64)
    69  PROLOGUE(mpn_lshiftc)
    70  	FUNC_ENTRY(4)			C rp=%rdi ap=%rsi n=%rdx cnt=%rcx
    71  	movd	R32(%rcx), %xmm4	C xmm4 = cnt, the left-shift count
    72  	mov	$64, R32(%rax)
    73  	sub	R32(%rcx), R32(%rax)
    74  	movd	R32(%rax), %xmm5	C xmm5 = 64-cnt, shift for low neighbour bits
    75  
    76  	neg	R32(%rcx)		C cl = -cnt; shr masks count mod 64 = 64-cnt
    77  	mov	-8(ap,n,8), %rax	C fetch most significant limb ap[n-1]
    78  	shr	R8(%rcx), %rax		C function result: the out-shifted high bits
    79  
    80  	pcmpeqb	%xmm3, %xmm3		C set to 111...111
    81  
    82  	cmp	$3, n
    83  	jle	L(bc)			C n <= 3: use basecase
    84  
    85  	lea	(rp,n,8), R32(%rcx)	C low bits of address just above rp[n-1]
    86  	test	$8, R8(%rcx)		C is top-end write 16-byte aligned already?
    87  	jz	L(rp_aligned)
    88  
    89  C Do one initial limb in order to make rp aligned
    90  	movq	-8(ap,n,8), %xmm0	C high limb
    91  	movq	-16(ap,n,8), %xmm1	C lower neighbour supplies incoming bits
    92  	psllq	%xmm4, %xmm0
    93  	psrlq	%xmm5, %xmm1
    94  	por	%xmm1, %xmm0		C (ap[n-1]<<cnt) | (ap[n-2]>>(64-cnt))
    95  	pxor	%xmm3, %xmm0		C complement -- the "c" in lshiftc
    96  	movq	%xmm0, -8(rp,n,8)
    97  	dec	n
    98  
    99  L(rp_aligned):
   100  	lea	1(n), %r8d
   101  
   102  	and	$6, R32(%r8)		C r8 = (n+1) & 6; dispatch on it to
   103  	jz	L(ba0)			C   enter the 8-limb unrolled loop at
   104  	cmp	$4, R32(%r8)		C   the point matching n mod 8
   105  	jz	L(ba4)
   106  	jc	L(ba2)
   107  L(ba6):	add	$-4, n			C bias n so the shared sub/jg works
   108  	jmp	L(i56)
   109  L(ba0):	add	$-6, n
   110  	jmp	L(i70)
   111  L(ba4):	add	$-2, n
   112  	jmp	L(i34)
   113  L(ba2):	add	$-8, n
   114  	jle	L(end)
   115  
   116  	ALIGN(16)
   117  L(top):	movdqu	40(ap,n,8), %xmm1	C main loop: 8 limbs per iteration,
   118  	movdqu	48(ap,n,8), %xmm0	C   unaligned loads, aligned stores
   119  	psllq	%xmm4, %xmm0
   120  	psrlq	%xmm5, %xmm1
   121  	por	%xmm1, %xmm0
   122  	pxor	%xmm3, %xmm0		C complement
   123  	movdqa	%xmm0, 48(rp,n,8)
   124  L(i70):
   125  	movdqu	24(ap,n,8), %xmm1
   126  	movdqu	32(ap,n,8), %xmm0
   127  	psllq	%xmm4, %xmm0
   128  	psrlq	%xmm5, %xmm1
   129  	por	%xmm1, %xmm0
   130  	pxor	%xmm3, %xmm0
   131  	movdqa	%xmm0, 32(rp,n,8)
   132  L(i56):
   133  	movdqu	8(ap,n,8), %xmm1
   134  	movdqu	16(ap,n,8), %xmm0
   135  	psllq	%xmm4, %xmm0
   136  	psrlq	%xmm5, %xmm1
   137  	por	%xmm1, %xmm0
   138  	pxor	%xmm3, %xmm0
   139  	movdqa	%xmm0, 16(rp,n,8)
   140  L(i34):
   141  	movdqu	-8(ap,n,8), %xmm1
   142  	movdqu	(ap,n,8), %xmm0
   143  	psllq	%xmm4, %xmm0
   144  	psrlq	%xmm5, %xmm1
   145  	por	%xmm1, %xmm0
   146  	pxor	%xmm3, %xmm0
   147  	movdqa	%xmm0, (rp,n,8)
   148  	sub	$8, n
   149  	jg	L(top)
   150  
   151  L(end):	test	$1, R8(n)		C one limb left, or two?
   152  	jnz	L(end8)
   153  
   154  	movdqu	(ap), %xmm1		C final two limbs
   155  	pxor	%xmm0, %xmm0
   156  	punpcklqdq  %xmm1, %xmm0	C xmm0 = ap[0]:0, zeros shift into limb 0
   157  	psllq	%xmm4, %xmm1
   158  	psrlq	%xmm5, %xmm0
   159  	por	%xmm1, %xmm0
   160  	pxor	%xmm3, %xmm0		C complement
   161  	movdqa	%xmm0, (rp)
   162  	FUNC_EXIT()
   163  	ret
   164  
   165  C Basecase
   166  	ALIGN(16)
   167  L(bc):	dec	R32(n)
   168  	jz	L(end8)			C n was 1: only the lowest limb remains
   169  
   170  	movq	(ap,n,8), %xmm1		C top limb plus bits from its neighbour
   171  	movq	-8(ap,n,8), %xmm0
   172  	psllq	%xmm4, %xmm1
   173  	psrlq	%xmm5, %xmm0
   174  	por	%xmm1, %xmm0
   175  	pxor	%xmm3, %xmm0		C complement
   176  	movq	%xmm0, (rp,n,8)
   177  	sub	$2, R32(n)
   178  	jl	L(end8)			C n was 2: done bar the lowest limb
   179  	movq	8(ap), %xmm1		C n was 3: do limb 1 as well
   180  	movq	(ap), %xmm0
   181  	psllq	%xmm4, %xmm1
   182  	psrlq	%xmm5, %xmm0
   183  	por	%xmm1, %xmm0
   184  	pxor	%xmm3, %xmm0
   185  	movq	%xmm0, 8(rp)
   186  
   187  L(end8):movq	(ap), %xmm0		C lowest limb: zeros come in from below
   188  	psllq	%xmm4, %xmm0
   189  	pxor	%xmm3, %xmm0		C complement
   190  	movq	%xmm0, (rp)
   191  	FUNC_EXIT()
   192  	ret
   193  EPILOGUE()