github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/lshift-movdqu2.asm

dnl  AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb     cycles/limb     cycles/limb    good
C              aligned	      unaligned	      best seen	   for cpu?
C AMD K8,K9	 3		 3		 2.35	  no, use shl/shr
C AMD K10	 1.5-1.8	 1.5-1.8	 1.33	  yes
C AMD bd1	 1.7-1.9	 1.7-1.9	 1.33	  yes
C AMD bobcat	 3.17		 3.17			  yes, bad for n < 20
C Intel P4	 4.67		 4.67		 2.7	  no, slow movdqu
C Intel core2	 2.15		 2.15		 1.25	  no, use shld/shrd
C Intel NHM	 1.66		 1.66		 1.25	  no, use shld/shrd
C Intel SBR	 1.3		 1.3		 1.25	  yes, bad for n = 4-6
C Intel atom	11.7		11.7		 4.5	  no
C VIA nano	 5.7		 5.95		 2.0	  no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast movdqu loads, and uses movdqu even for aligned
C operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could the 2-limb wind-down code be simplified?
C  * Improve the basecase code, using shld/shrd for SBR and discrete integer
C    shifts for the other affected CPUs.

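C For reference, a plain C sketch (not from the GMP sources) of what this
C routine computes, assuming 64-bit limbs as used throughout this directory:
C
C   /* Shift {ap,n} left by cnt bits (1 <= cnt < 64), write the low n limbs
C      to {rp,n} and return the bits shifted out at the top.  */
C   mp_limb_t
C   ref_lshift (mp_limb_t *rp, const mp_limb_t *ap, long n, unsigned cnt)
C   {
C     unsigned tnc = 64 - cnt;
C     mp_limb_t retval = ap[n - 1] >> tnc;
C     for (long i = n - 1; i > 0; i--)
C       rp[i] = (ap[i] << cnt) | (ap[i - 1] >> tnc);
C     rp[0] = ap[0] << cnt;
C     return retval;
C   }
C
C Like the code below, the sketch works from the most significant limb down,
C which is what allows {rp,n} and {ap,n} to overlap provided rp >= ap.
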
C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_lshift)
	FUNC_ENTRY(4)
	movd	R32(%rcx), %xmm4
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5

	neg	R32(%rcx)
	mov	-8(ap,n,8), %rax
	shr	R8(%rcx), %rax

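C Here xmm4 = cnt and xmm5 = 64-cnt, and %rax = ap[n-1] >> (64-cnt), the bits
C shifted out of the top limb, i.e. the return value.  Small operands
C (n <= 3) take the basecase path.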
	cmp	$3, n
	jle	L(bc)

	lea	(rp,n,8), R32(%rcx)
	test	$8, R8(%rcx)
	jz	L(rp_aligned)

C Do one initial limb in order to make rp aligned
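C (rp[n-1] = (ap[n-1] << cnt) | (ap[n-2] >> (64-cnt)), then n--, so that
C rp + 8*n becomes 16-byte aligned.)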
	movq	-8(ap,n,8), %xmm0
	movq	-16(ap,n,8), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movq	%xmm0, -8(rp,n,8)
	dec	n

L(rp_aligned):
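C rp + 8*n is now 16-byte aligned, so every movdqa store below is aligned.
C Pick one of four entry points into the 8-limbs-per-iteration loop from
C (n+1) & 6, leaving at most two limbs for the wind-down at L(end).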
	lea	1(n), %r8d

	and	$6, R32(%r8)
	jz	L(ba0)
	cmp	$4, R32(%r8)
	jz	L(ba4)
	jc	L(ba2)
L(ba6):	add	$-4, n
	jmp	L(i56)
L(ba0):	add	$-6, n
	jmp	L(i70)
L(ba4):	add	$-2, n
	jmp	L(i34)
L(ba2):	add	$-8, n
	jle	L(end)

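C Main loop: 8 limbs per iteration, written as four aligned 16-byte stores.
C For each 16-byte chunk, the movdqu load at the matching ap offset gives the
C two limbs to shift left, and the movdqu load 8 bytes below it gives each
C limb's lower neighbour, whose high bits are shifted in from below.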
	ALIGN(16)
L(top):	movdqu	40(ap,n,8), %xmm1
	movdqu	48(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 48(rp,n,8)
L(i70):
	movdqu	24(ap,n,8), %xmm1
	movdqu	32(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 32(rp,n,8)
L(i56):
	movdqu	8(ap,n,8), %xmm1
	movdqu	16(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 16(rp,n,8)
L(i34):
	movdqu	-8(ap,n,8), %xmm1
	movdqu	(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp,n,8)
	sub	$8, n
	jg	L(top)

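C Wind-down.  If an odd number of limbs remains, only the bottom limb is left
C and L(end8) writes rp[0] = ap[0] << cnt.  Otherwise two limbs remain:
C punpcklqdq places ap[0] above a zero limb, so one aligned 16-byte store
C writes rp[1] = (ap[1] << cnt) | (ap[0] >> (64-cnt)) and rp[0] = ap[0] << cnt.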
L(end):	test	$1, R8(n)
	jnz	L(end8)

	movdqu	(ap), %xmm1
	pxor	%xmm0, %xmm0
	punpcklqdq  %xmm1, %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret

C Basecase
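C Handle n <= 3 limb by limb, from the most significant limb down, ending
C with rp[0] = ap[0] << cnt at L(end8).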
	ALIGN(16)
L(bc):	dec	R32(n)
	jz	L(end8)

	movq	(ap,n,8), %xmm1
	movq	-8(ap,n,8), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, (rp,n,8)
	sub	$2, R32(n)
	jl	L(end8)
	movq	8(ap), %xmm1
	movq	(ap), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, 8(rp)

L(end8):movq	(ap), %xmm0
	psllq	%xmm4, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()