github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/core2/lshift.asm (about)

     1  dnl  x86-64 mpn_lshift optimized for "Core 2".
     2  
     3  dnl  Copyright 2007, 2009, 2011, 2012 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C	     cycles/limb
    35  C AMD K8,K9	 4.25
    36  C AMD K10	 4.25
    37  C Intel P4	14.7
    38  C Intel core2	 1.27
    39  C Intel NHM	 1.375	(up to about n = 260, then 1.5)
    40  C Intel SBR	 1.87
    41  C Intel atom	 ?
    42  C VIA nano	 ?
    43  
    44  
    45  C INPUT PARAMETERS
        C Symbolic names for the four argument registers.  %rdi, %rsi, %rdx and
        C %rcx are, in that order, the first four integer argument registers of
        C the System V AMD64 calling convention.
        C
        C rp: result pointer; every store in mpn_lshift is addressed through it.
    46  define(`rp',	`%rdi')
        C up: source pointer; every load in mpn_lshift is addressed through it.
    47  define(`up',	`%rsi')
        C n: operand size counted in 8-byte limbs (it is scaled by 8 when the
        C pointers are advanced to the most significant limb).
    48  define(`n',	`%rdx')
        C cnt: shift count; it reaches shld/shl only through R8(cnt).
    49  define(`cnt',	`%rcx')
    50  
        C NOTE(review): ABI_SUPPORT is defined outside this file; presumably these
        C two lines declare that the code may be built for both the Windows
        C (DOS64) and the System V (STD64) x86-64 ABIs -- confirm in the x86_64
        C m4 definitions.
    51  ABI_SUPPORT(DOS64)
    52  ABI_SUPPORT(STD64)
    53  
    54  ASM_START()
    55  	TEXT
    56  	ALIGN(16)
        C mpn_lshift
        C
        C Shift the n-limb number at up left by cnt bits, store the n result limbs
        C at rp, and return in %rax the bits shifted out of the most significant
        C source limb (right-justified: %rax is zeroed and then filled by shld).
        C
        C Registers:
        C   rp, up, n	arguments, all three are modified on the way
        C   cnt		argument, read only
        C   %rax		n mod 4 during dispatch, afterwards the return value
        C   %r8-%r11	rotating window of source limbs
        C
        C Method: both pointers are first moved to the most significant limb and
        C the operand is then walked towards lower addresses.  The main loop is
        C unrolled four ways and ends with a three-limb tail at L(end).  n mod 4
        C selects one of four feed-in sequences (b00, b01, b10, b11).  Each one
        C preloads the top limbs, produces the return value, biases rp/up/n and
        C enters the loop body at the matching stage, so the loop needs no
        C separate remainder code.  n = 1 and n = 2 are finished directly at
        C L(le1) and L(le2).
        C
        C NOTE(review): R32(), R8(), L(), FUNC_ENTRY() and FUNC_EXIT() are macros
        C defined outside this file.  R32/R8 appear to select the 32-bit and 8-bit
        C views of a register; FUNC_ENTRY(4)/FUNC_EXIT() presumably adapt the four
        C arguments for the DOS64 ABI -- confirm in the x86_64 m4 definitions.
        C
        C NOTE(review): there is no path for n = 0 (it would be dispatched as a
        C multiple of 4 and touch limbs below both operands), and a shift count
        C of 0 would return 0 and merely copy the limbs.  Presumably callers
        C guarantee n >= 1 and 1 <= cnt <= 63 -- confirm against the GMP manual.
    57  PROLOGUE(mpn_lshift)
    58  	FUNC_ENTRY(4)
    59  	lea	-8(rp,n,8), rp		C rp -> most significant result limb
    60  	lea	-8(up,n,8), up		C up -> most significant source limb
    61  
    62  	mov	R32(%rdx), R32(%rax)	C %rdx is n; its low 32 bits suffice for n mod 4
    63  	and	$3, R32(%rax)		C rax = n mod 4, sets ZF for the dispatch
    64  	jne	L(nb00)
    65  L(b00):	C n = 4, 8, 12, ...
    66  	mov	(up), %r10		C top limb
    67  	mov	-8(up), %r11
    68  	xor	R32(%rax), R32(%rax)
    69  	shld	R8(cnt), %r10, %rax	C return value: bits shifted out of the top limb
    70  	mov	-16(up), %r8
    71  	lea	24(rp), rp		C bias rp so that -24(rp) at L(00) is the top limb
    72  	sub	$4, n			C bias n for the sub/jnc at the loop bottom
    73  	jmp	L(00)
    74  
    75  L(nb00):C n mod 4 != 0; the n = 1, 5, 9, ... case falls through to b01
    76  	cmp	$2, R32(%rax)
    77  	jae	L(nb01)			C n mod 4 is 2 or 3
    78  L(b01):	mov	(up), %r9		C top limb
    79  	xor	R32(%rax), R32(%rax)
    80  	shld	R8(cnt), %r9, %rax	C return value: bits shifted out of the top limb
    81  	sub	$2, n
    82  	jb	L(le1)			C borrow only when n was 1
    83  	mov	-8(up), %r10
    84  	mov	-16(up), %r11
    85  	lea	-8(up), up		C bias up so that -16(up) at L(01) is the next unread limb
    86  	lea	16(rp), rp		C bias rp so that -16(rp) at L(01) is the top limb
    87  	jmp	L(01)
    88  L(le1):	shl	R8(cnt), %r9		C n = 1: single limb, zeros are shifted in
    89  	mov	%r9, (rp)
    90  	FUNC_EXIT()
    91  	ret
    92  
    93  L(nb01):C n mod 4 is 2 or 3; the n = 2, 6, 10, ... case falls through to b10
    94  	jne	L(b11)			C flags still come from the cmp $2 above
    95  L(b10):	mov	(up), %r8		C top limb
    96  	mov	-8(up), %r9
    97  	xor	R32(%rax), R32(%rax)
    98  	shld	R8(cnt), %r8, %rax	C return value: bits shifted out of the top limb
    99  	sub	$3, n
   100  	jb	L(le2)			C borrow only when n was 2
   101  	mov	-16(up), %r10
   102  	lea	-16(up), up		C bias up so that -8(up) at L(10) is the next unread limb
   103  	lea	8(rp), rp		C bias rp so that -8(rp) at L(10) is the top limb
   104  	jmp	L(10)
   105  L(le2):	shld	R8(cnt), %r9, %r8	C n = 2: upper limb takes its low bits from the lower one
   106  	mov	%r8, (rp)
   107  	shl	R8(cnt), %r9		C lower limb, zeros are shifted in
   108  	mov	%r9, -8(rp)
   109  	FUNC_EXIT()
   110  	ret
   111  
   112  	ALIGN(16)			C performance critical!
   113  L(b11):	C n = 3, 7, 11, ...
   114  	mov	(up), %r11		C top limb
   115  	mov	-8(up), %r8
   116  	xor	R32(%rax), R32(%rax)
   117  	shld	R8(cnt), %r11, %rax	C return value: bits shifted out of the top limb
   118  	mov	-16(up), %r9
   119  	lea	-24(up), up		C up -> next unread limb, as L(top) expects
   120  	sub	$4, n
   121  	jb	L(end)			C borrow only when n was 3: go straight to the tail
   122  
        C Main loop, four limbs per pass.  Every stage merges one limb with the
        C high bits of its lower neighbour using shld, loads a limb that is needed
        C three stages later, and stores the merged limb.  The window rotates
        C through r11, r8, r9, r10.  L(10), L(01) and L(00) double as entry points
        C for the feed-in code.
   123  	ALIGN(16)
   124  L(top):	shld	R8(cnt), %r8, %r11
   125  	mov	(up), %r10
   126  	mov	%r11, (rp)
   127  L(10):	shld	R8(cnt), %r9, %r8
   128  	mov	-8(up), %r11
   129  	mov	%r8, -8(rp)
   130  L(01):	shld	R8(cnt), %r10, %r9
   131  	mov	-16(up), %r8
   132  	mov	%r9, -16(rp)
   133  L(00):	shld	R8(cnt), %r11, %r10
   134  	mov	-24(up), %r9
   135  	mov	%r10, -24(rp)
   136  	add	$-32, up		C step both pointers down by four limbs
   137  	lea	-32(rp), rp
   138  	sub	$4, n
   139  	jnc	L(top)			C no borrow: at least four more limbs before the tail
   140  
        C Tail: the three least significant limbs are in r11, r8, r9, high to low.
   141  L(end):	shld	R8(cnt), %r8, %r11
   142  	mov	%r11, (rp)
   143  	shld	R8(cnt), %r9, %r8
   144  	mov	%r8, -8(rp)
   145  	shl	R8(cnt), %r9		C least significant limb, zeros are shifted in
   146  	mov	%r9, -16(rp)
   147  	FUNC_EXIT()
   148  	ret
   149  EPILOGUE()