github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/pentium4/lshiftc.asm (about)

     1  dnl  x86-64 mpn_lshiftc optimized for Pentium 4.
     2  
     3  dnl  Copyright 2003, 2005, 2007, 2008, 2010, 2012 Free Software Foundation,
     4  dnl  Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  
    35  C	     cycles/limb
    36  C AMD K8,K9	 ?
    37  C AMD K10	 ?
    38  C Intel P4	 4.15
    39  C Intel core2	 ?
    40  C Intel corei	 ?
    41  C Intel atom	 ?
    42  C VIA nano	 ?
    43  
    44  C INPUT PARAMETERS (roles below are read off from how the body uses each register)
    45  define(`rp',`%rdi')		C base address for every store in the routine (result limbs)
    46  define(`up',`%rsi')		C base address for every load in the routine (source limbs)
    47  define(`n',`%rdx')		C limb count; used as an index scaled by 8 and counted down
    48  define(`cnt',`%cl')		C shift count; NOTE(review): the body spells it R32(%rcx) and R8(%rcx), never cnt
    49  
    50  ABI_SUPPORT(DOS64)		C NOTE(review): macro defined outside this file; presumably declares Windows x64 ABI support - confirm
    51  ABI_SUPPORT(STD64)		C NOTE(review): presumably declares System V x86-64 ABI support - confirm
    52  
    53  ASM_START()
    54  	TEXT
    55  	ALIGN(32)
    56  PROLOGUE(mpn_lshiftc)
    57  	FUNC_ENTRY(4)
    58  	mov	-8(up,n,8), %rax		C rax = up[n-1]; the routine stores rp[i] = ~((up[i] << lsh) | (up[i-1] >> rsh)) from i = n-1 down
    59  	pcmpeqd	%mm6, %mm6		C mm6 = all ones, the mask used for every complement below
    60  	movd	R32(%rcx), %mm4		C mm4 = lsh, the left shift count as passed in
    61  	neg	R32(%rcx)		C put rsh count in cl
    62  	and	$63, R32(%rcx)		C rsh = (-lsh) mod 64
    63  	movd	R32(%rcx), %mm5		C mm5 = rsh, the right shift count
    64  
    65  	lea	1(n), R32(%r8)		C r8 = n + 1 (32-bit); only its low two bits are used
    66  
    67  	shr	R8(%rcx), %rax		C function return value: up[n-1] >> rsh
    68  
    69  	and	$3, R32(%r8)		C r8 = (n + 1) mod 4 picks the entry path
    70  	je	L(rol)			C jump for n = 3, 7, 11, ...
    71  
    72  	dec	R32(%r8)
    73  	jne	L(1)			C fall through only when (n + 1) mod 4 == 1
    74  C	n = 4, 8, 12, ...
    75  	movq	-8(up,n,8), %mm2		C peel one limb so that n becomes 3 mod 4
    76  	psllq	%mm4, %mm2		C mm2 = up[n-1] << lsh
    77  	movq	-16(up,n,8), %mm0
    78  	pxor	%mm6, %mm2		C mm2 = ~(up[n-1] << lsh)
    79  	psrlq	%mm5, %mm0		C mm0 = up[n-2] >> rsh
    80  	pandn	%mm2, %mm0		C mm0 = ~mm0 & mm2 = ~((up[n-1] << lsh) | (up[n-2] >> rsh))
    81  	movq	%mm0, -8(rp,n,8)		C rp[n-1] = mm0
    82  	dec	n			C n is now 3 mod 4
    83  	jmp	L(rol)
    84  
    85  L(1):	dec	R32(%r8)
    86  	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
    87  C	n = 2, 6, 10, 14, ...
    88  	movq	-8(up,n,8), %mm2		C peel one limb, same sequence as the n = 4, 8, 12 case
    89  	psllq	%mm4, %mm2
    90  	movq	-16(up,n,8), %mm0
    91  	pxor	%mm6, %mm2
    92  	psrlq	%mm5, %mm0
    93  	pandn	%mm2, %mm0
    94  	movq	%mm0, -8(rp,n,8)		C rp[n-1]
    95  	dec	n			C n is now 1 mod 4
    96  L(1x):
    97  	cmp	$1, n
    98  	je	L(ast)			C one limb left: only rp[0] remains to be stored
    99  	movq	-8(up,n,8), %mm2		C otherwise peel two limbs, rp[n-1] and rp[n-2]
   100  	psllq	%mm4, %mm2		C mm2 = up[n-1] << lsh
   101  	movq	-16(up,n,8), %mm3
   102  	psllq	%mm4, %mm3		C mm3 = up[n-2] << lsh
   103  	movq	-16(up,n,8), %mm0
   104  	movq	-24(up,n,8), %mm1
   105  	pxor	%mm6, %mm2
   106  	psrlq	%mm5, %mm0		C mm0 = up[n-2] >> rsh
   107  	pandn	%mm2, %mm0
   108  	pxor	%mm6, %mm3
   109  	psrlq	%mm5, %mm1		C mm1 = up[n-3] >> rsh
   110  	pandn	%mm3, %mm1
   111  	movq	%mm0, -8(rp,n,8)		C rp[n-1]
   112  	movq	%mm1, -16(rp,n,8)		C rp[n-2]
   113  	sub	$2, n			C n is now 3 mod 4
   114  
   115  L(rol):	movq	-8(up,n,8), %mm2		C all paths arrive with n = 3 mod 4; preload two left-shifted limbs
   116  	psllq	%mm4, %mm2		C mm2 = up[n-1] << lsh
   117  	movq	-16(up,n,8), %mm3
   118  	psllq	%mm4, %mm3		C mm3 = up[n-2] << lsh
   119  
   120  	sub	$4, n
   121  	jb	L(end)			C borrow means n was 3: no full group of four, skip the loop
   122  	ALIGN(32)
   123  L(top):
   124  	C finish stuff from lsh block
   125  	movq	16(up,n,8), %mm0		C each pass stores rp[n+3] .. rp[n]; n is already reduced by 4
   126  	pxor	%mm6, %mm2		C mm2 = ~(up[n+3] << lsh)
   127  	movq	8(up,n,8), %mm1
   128  	psrlq	%mm5, %mm0		C mm0 = up[n+2] >> rsh
   129  	psrlq	%mm5, %mm1		C mm1 = up[n+1] >> rsh
   130  	pandn	%mm2, %mm0
   131  	pxor	%mm6, %mm3		C mm3 = ~(up[n+2] << lsh)
   132  	movq	%mm0, 24(rp,n,8)		C rp[n+3]
   133  	movq	(up,n,8), %mm0
   134  	pandn	%mm3, %mm1
   135  	movq	%mm1, 16(rp,n,8)		C rp[n+2]
   136  	movq	-8(up,n,8), %mm1
   137  	C start two new rsh
   138  	psrlq	%mm5, %mm0		C mm0 = up[n] >> rsh
   139  	psrlq	%mm5, %mm1		C mm1 = up[n-1] >> rsh
   140  
   141  	C finish stuff from rsh block
   142  	movq	8(up,n,8), %mm2
   143  	pxor	%mm6, %mm0		C in this half the right-shifted part is the one complemented
   144  	movq	(up,n,8), %mm3
   145  	psllq	%mm4, %mm2		C mm2 = up[n+1] << lsh
   146  	psllq	%mm4, %mm3		C mm3 = up[n] << lsh
   147  	pandn	%mm0, %mm2		C mm2 = ~mm2 & mm0 = ~((up[n+1] << lsh) | (up[n] >> rsh))
   148  	pxor	%mm6, %mm1
   149  	movq	%mm2, 8(rp,n,8)		C rp[n+1]
   150  	movq	-8(up,n,8), %mm2
   151  	pandn	%mm1, %mm3
   152  	movq	%mm3, (rp,n,8)		C rp[n]
   153  	movq	-16(up,n,8), %mm3
   154  	C start two new lsh
   155  	sub	$4, n			C the jae below tests the borrow from this sub; psllq does not write flags
   156  	psllq	%mm4, %mm2		C mm2 = up[n+3] << lsh, in terms of the reduced n
   157  	psllq	%mm4, %mm3		C mm3 = up[n+2] << lsh, in terms of the reduced n
   158  
   159  	jae	L(top)
   160  
   161  L(end):	pxor	%mm6, %mm2		C tail: mm2 = ~(up[2] << lsh), mm3 holds up[1] << lsh
   162  	movq	8(up), %mm0
   163  	psrlq	%mm5, %mm0		C mm0 = up[1] >> rsh
   164  	pandn	%mm2, %mm0
   165  	pxor	%mm6, %mm3		C mm3 = ~(up[1] << lsh)
   166  	movq	(up), %mm1
   167  	psrlq	%mm5, %mm1		C mm1 = up[0] >> rsh
   168  	pandn	%mm3, %mm1
   169  	movq	%mm0, 16(rp)		C rp[2]
   170  	movq	%mm1, 8(rp)		C rp[1]
   171  
   172  L(ast):	movq	(up), %mm2
   173  	psllq	%mm4, %mm2
   174  	pxor	%mm6, %mm2		C no lower limb feeds rp[0], so its low lsh bits come out as ones
   175  	movq	%mm2, (rp)		C rp[0] = ~(up[0] << lsh)
   176  	emms			C leave MMX state before returning
   177  	FUNC_EXIT()
   178  	ret
   179  EPILOGUE()