github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/pentium4/rshift.asm (about)

     1  dnl  x86-64 mpn_rshift optimized for Pentium 4.
     2  
     3  dnl  Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C	     cycles/limb
    35  C AMD K8,K9	 2.5
    36  C AMD K10	 ?
    37  C Intel P4	 3.29
    38  C Intel core2	 2.1 (fluctuates, presumably cache related)
    39  C Intel corei	 ?
    40  C Intel atom	14.3
    41  C VIA nano	 ?
    42  
    43  C INPUT PARAMETERS
    44  define(`rp',`%rdi')		C destination limb pointer, only stored through
    45  define(`up',`%rsi')		C source limb pointer, only loaded from
    46  define(`n',`%rdx')		C number of 64-bit limbs, negated and used as an index by the routine
    47  define(`cnt',`%cl')		C shift count in bits, the routine reads it through %rcx
    48  
    49  ABI_SUPPORT(DOS64)		C NOTE(review): macro defined outside this file, presumably declares the supported ABIs
    50  ABI_SUPPORT(STD64)
    51  
    52  ASM_START()
    53  	TEXT
    54  	ALIGN(32)			C NOTE(review): presumably aligns the entry point to 32 bytes, macro defined outside this file
    55  PROLOGUE(mpn_rshift)		C rp[0..n-1] = up[0..n-1] >> cnt, returns up[0] << (64-cnt)
    56  	FUNC_ENTRY(4)			C NOTE(review): macro defined outside this file, presumably ABI entry glue for 4 arguments
    57  	mov	(up), %rax		C rax = up[0], turned into the return value below
    58  	movd	R32(%rcx), %mm4		C mm4 = cnt, the right shift count
    59  	neg	R32(%rcx)			C put lsh count in cl
    60  	and	$63, R32(%rcx)		C ecx = (-cnt) mod 64, i.e. 64-cnt for cnt in 1..63 (cnt = 0 would give 0, presumably excluded by callers, confirm)
    61  	movd	R32(%rcx), %mm5		C mm5 = complementary left shift count
    62    
    63  	lea	-8(up,n,8), up		C up = address of the last source limb, up[n-1]
    64  	lea	-8(rp,n,8), rp		C rp = address of the last destination limb, rp[n-1]
    65  	lea	1(n), R32(%r8)		C r8d = n+1, only its low two bits are used for the dispatch below
    66  	neg	n			C n = -n, a negative index that counts up toward zero
    67    
    68  	shl	R8(%rcx), %rax		C function return value: up[0] << lsh count
    69    
    70  	and	$3, R32(%r8)		C r8d = (n+1) mod 4
    71  	je	L(rol)			C jump for n = 3, 7, 11, ...
    72    
    73  	dec	R32(%r8)		C zero iff (n+1) mod 4 = 1
    74  	jne	L(1)
    75  C	n = 4, 8, 12, ...	Peel one limb so the remaining count is 3 mod 4.
    76  	movq	8(up,n,8), %mm2		C mm2 = up[0]
    77  	psrlq	%mm4, %mm2		C mm2 = up[0] >> cnt
    78  	movq	16(up,n,8), %mm0		C mm0 = up[1]
    79  	psllq	%mm5, %mm0		C mm0 = up[1] << lsh count
    80  	por	%mm0, %mm2
    81  	movq	%mm2, 8(rp,n,8)		C rp[0] = combined limb
    82  	inc	n			C one limb done
    83  	jmp	L(rol)
    84    
    85  L(1):	dec	R32(%r8)		C zero iff (n+1) mod 4 = 2
    86  	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
    87  C	n = 2, 6, 10, 14, ...	Peel one limb, then continue as for n = 1, 5, 9, ...
    88  	movq	8(up,n,8), %mm2		C mm2 = up[0]
    89  	psrlq	%mm4, %mm2		C mm2 = up[0] >> cnt
    90  	movq	16(up,n,8), %mm0		C mm0 = up[1]
    91  	psllq	%mm5, %mm0		C mm0 = up[1] << lsh count
    92  	por	%mm0, %mm2
    93  	movq	%mm2, 8(rp,n,8)		C rp[0] = combined limb
    94  	inc	n			C one limb done
    95  L(1x):
    96  	cmp	$-1, n			C n = -1 means exactly one limb is left
    97  	je	L(ast)
    98  	movq	8(up,n,8), %mm2		C otherwise peel the next two limbs, leaving a count that is 3 mod 4
    99  	psrlq	%mm4, %mm2
   100  	movq	16(up,n,8), %mm3
   101  	psrlq	%mm4, %mm3
   102  	movq	16(up,n,8), %mm0
   103  	movq	24(up,n,8), %mm1
   104  	psllq	%mm5, %mm0
   105  	por	%mm0, %mm2
   106  	psllq	%mm5, %mm1
   107  	por	%mm1, %mm3
   108  	movq	%mm2, 8(rp,n,8)
   109  	movq	%mm3, 16(rp,n,8)
   110  	add	$2, n			C two limbs done
   111    
   112  L(rol):	movq	8(up,n,8), %mm2		C preload the next two source limbs and
   113  	psrlq	%mm4, %mm2		C start their right shifts in mm2, mm3
   114  	movq	16(up,n,8), %mm3
   115  	psrlq	%mm4, %mm3
   116  	C carry out of the add below means the index passed zero: only the two preloaded limbs and the top limb remain
   117  	add	$4, n			C				      4
   118  	jb	L(end)			C				      2
   119  	ALIGN(32)
   120  L(top):
   121  	C finish the two results whose rsh halves are already in mm2, mm3
   122  	movq	-16(up,n,8), %mm0
   123  	movq	-8(up,n,8), %mm1
   124  	psllq	%mm5, %mm0
   125  	por	%mm0, %mm2
   126  	psllq	%mm5, %mm1
   127  	movq	(up,n,8), %mm0
   128  	por	%mm1, %mm3
   129  	movq	8(up,n,8), %mm1
   130  	movq	%mm2, -24(rp,n,8)
   131  	movq	%mm3, -16(rp,n,8)
   132  	C start two new results from their lsh halves in mm0, mm1
   133  	psllq	%mm5, %mm0
   134  	psllq	%mm5, %mm1
   135    
   136  	C finish those two results by OR-ing in their rsh halves
   137  	movq	-8(up,n,8), %mm2
   138  	movq	(up,n,8), %mm3
   139  	psrlq	%mm4, %mm2
   140  	por	%mm2, %mm0
   141  	psrlq	%mm4, %mm3
   142  	movq	8(up,n,8), %mm2
   143  	por	%mm3, %mm1
   144  	movq	16(up,n,8), %mm3
   145  	movq	%mm0, -8(rp,n,8)
   146  	movq	%mm1, (rp,n,8)
   147  	C start the rsh halves of two new results for the next iteration
   148  	add	$4, n			C sets the carry tested by jae below, the MMX shifts in between leave the flags alone
   149  	psrlq	%mm4, %mm2
   150  	psrlq	%mm4, %mm3
   151    
   152  	jae	L(top)			C				      2
   153  L(end):
   154  	movq	-8(up), %mm0		C mm0 = up[n-2] (indices relative to the original arrays)
   155  	psllq	%mm5, %mm0
   156  	por	%mm0, %mm2		C mm2 = value for rp[n-3]
   157  	movq	(up), %mm1		C mm1 = up[n-1]
   158  	psllq	%mm5, %mm1
   159  	por	%mm1, %mm3		C mm3 = value for rp[n-2]
   160  	movq	%mm2, -16(rp)
   161  	movq	%mm3, -8(rp)
   162    
   163  L(ast):	movq	(up), %mm2		C top limb: no higher limb to OR in, so zeros shift in from above
   164  	psrlq	%mm4, %mm2
   165  	movq	%mm2, (rp)		C rp[n-1] = up[n-1] >> cnt
   166  	emms				C leave MMX state before returning
   167  	FUNC_EXIT()			C NOTE(review): macro defined outside this file, presumably undoes FUNC_ENTRY
   168  	ret
   169  EPILOGUE()