github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/rshift-movdqu2.asm

dnl  AMD64 mpn_rshift optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb     cycles/limb     cycles/limb    good
C              aligned	      unaligned	      best seen	   for cpu?
C AMD K8,K9	 3		 3		 2.35	  no, use shl/shr
C AMD K10	 1.5-1.8	 1.5-1.8	 1.33	  yes
C AMD bd1	 1.7-1.9	 1.7-1.9	 1.33	  yes
C AMD bobcat	 3.17		 3.17			  yes, bad for n < 20
C Intel P4	 4.67		 4.67		 2.7	  no, slow movdqu
C Intel core2	 2.15		 2.15		 1.25	  no, use shld/shrd
C Intel NHM	 1.66		 1.66		 1.25	  no, use shld/shrd
C Intel SBR	 1.3		 1.3		 1.25	  yes, bad for n = 4-6
C Intel atom	11.7		11.7		 4.5	  no
C VIA nano	 5.7		 5.95		 2.0	  no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast movdqu loads, and uses them even for aligned
C operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could the 2-limb wind-down code be simplified?
C  * Improve the basecase code, using shld/shrd for SBR and discrete integer
C    shifts for the other affected CPUs.

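C For reference, the operation performed by this routine corresponds roughly
C to the following C sketch (illustrative only, not part of the original
C file; assumes 64-bit limbs without nails, 1 <= cnt <= 63, and a
C hypothetical name ref_rshift):
C
C   mp_limb_t
C   ref_rshift (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n, unsigned cnt)
C   {
C     mp_limb_t retval = ap[0] << (64 - cnt);	/* bits shifted out at the bottom */
C     mp_size_t i;
C     for (i = 0; i < n - 1; i++)
C       rp[i] = (ap[i] >> cnt) | (ap[i + 1] << (64 - cnt));
C     rp[n - 1] = ap[n - 1] >> cnt;		/* zeros shifted in at the top */
C     return retval;
C   }
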
C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

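C These registers follow the x86-64 SysV argument order for the public entry
C point, whose documented prototype is
C   mp_limb_t mpn_rshift (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n,
C                         unsigned int cnt);
C (parameter names as used in this file).
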
ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_rshift)
	FUNC_ENTRY(4)
	movd	R32(%rcx), %xmm4
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5

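C xmm4 now holds cnt and xmm5 holds 64-cnt, so a pair of adjacent limbs can
C be combined as (low >> cnt) | (high << (64-cnt)) with psrlq, psllq and por.
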
	neg	R32(%rcx)
	mov	(ap), %rax
	shl	R8(%rcx), %rax

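C %rax = ap[0] << (64-cnt): the bits shifted out at the low end, which is the
C return value of mpn_rshift.  x86 shift counts are reduced mod 64, hence the
C neg above.
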
	cmp	$3, n
	jle	L(bc)

	test	$8, R8(rp)
	jz	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	(ap), %xmm0
	movq	8(ap), %xmm1
	psrlq	%xmm4, %xmm0
	psllq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movq	%xmm0, (rp)
	lea	8(ap), ap
	lea	8(rp), rp
	dec	n

L(rp_aligned):
	lea	1(n), %r8d
	lea	(ap,n,8), ap
	lea	(rp,n,8), rp
	neg	n

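C ap and rp now point just beyond the operands and n is negative, so the
C (ap,n,8) and (rp,n,8) addressing below walks the operands upwards.  The
C dispatch that follows picks one of four entry points into the 8-limb
C unrolled loop, depending on the residual limb count, leaving one or two
C limbs for the wind-down code at L(end).
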
	and	$6, R32(%r8)
	jz	L(bu0)
	cmp	$4, R32(%r8)
	jz	L(bu4)
	jc	L(bu2)
L(bu6):	add	$4, n
	jmp	L(i56)
L(bu0):	add	$6, n
	jmp	L(i70)
L(bu4):	add	$2, n
	jmp	L(i34)
L(bu2):	add	$8, n
	jge	L(end)

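C Main loop: eight limbs (four 16-byte chunks) per iteration.  For each chunk
C the two movdqu loads overlap by one limb, so psrlq/psllq/por produce two
C shifted result limbs at once; the stores can use aligned movdqa since rp
C was aligned to 16 bytes above.
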
	ALIGN(16)
L(top):	movdqu	-64(ap,n,8), %xmm1
	movdqu	-56(ap,n,8), %xmm0
	psllq	%xmm5, %xmm0
	psrlq	%xmm4, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, -64(rp,n,8)
L(i70):
	movdqu	-48(ap,n,8), %xmm1
	movdqu	-40(ap,n,8), %xmm0
	psllq	%xmm5, %xmm0
	psrlq	%xmm4, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, -48(rp,n,8)
L(i56):
	movdqu	-32(ap,n,8), %xmm1
	movdqu	-24(ap,n,8), %xmm0
	psllq	%xmm5, %xmm0
	psrlq	%xmm4, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, -32(rp,n,8)
L(i34):
	movdqu	-16(ap,n,8), %xmm1
	movdqu	-8(ap,n,8), %xmm0
	psllq	%xmm5, %xmm0
	psrlq	%xmm4, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, -16(rp,n,8)
	add	$8, n
	jl	L(top)

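C Wind-down: one or two limbs remain.  The top limb is combined with an
C implicit zero limb, i.e. zeros are shifted in from above.
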
L(end):	test	$1, R8(n)
	jnz	L(e1)

	movdqu	-16(ap), %xmm1
	movq	-8(ap), %xmm0
	psrlq	%xmm4, %xmm1
	psllq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movdqa	%xmm0, -16(rp)
	FUNC_EXIT()
	ret

L(e1):	movq	-8(ap), %xmm0
	psrlq	%xmm4, %xmm0
	movq	%xmm0, -8(rp)
	FUNC_EXIT()
	ret

C Basecase
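C Handles n = 1, 2 and 3 using 8-byte movq operations only, so rp alignment
C does not matter here.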
	ALIGN(16)
L(bc):	dec	R32(n)
	jnz	1f
	movq	(ap), %xmm0
	psrlq	%xmm4, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret

1:	movq	(ap), %xmm1
	movq	8(ap), %xmm0
	psrlq	%xmm4, %xmm1
	psllq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, (rp)
	dec	R32(n)
	jnz	1f
	movq	8(ap), %xmm0
	psrlq	%xmm4, %xmm0
	movq	%xmm0, 8(rp)
	FUNC_EXIT()
	ret

1:	movq	8(ap), %xmm1
	movq	16(ap), %xmm0
	psrlq	%xmm4, %xmm1
	psllq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, 8(rp)
	movq	16(ap), %xmm0
	psrlq	%xmm4, %xmm0
	movq	%xmm0, 16(rp)
	FUNC_EXIT()
	ret
EPILOGUE()