github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/lshift.asm (about)

     1  dnl  AMD64 mpn_lshift -- mpn left shift.
     2  
     3  dnl  Copyright 2003, 2005, 2007, 2009, 2011, 2012 Free Software Foundation,
     4  dnl  Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  
    35  C	     cycles/limb   cycles/limb cnt=1
    36  C AMD K8,K9	 2.375		 1.375
    37  C AMD K10	 2.375		 1.375
    38  C Intel P4	 8		10.5
    39  C Intel core2	 2.11		 4.28
    40  C Intel corei	 ?		 ?
    41  C Intel atom	 5.75		 3.5
    42  C VIA nano	 3.5		 2.25
    43  
    44  
    45  C INPUT PARAMETERS (m4 names for the argument registers used by the code below)
    46  define(`rp',	`%rdi')		C result limbs are stored through this pointer
    47  define(`up',	`%rsi')		C source limbs are loaded through this pointer
    48  define(`n',	`%rdx')		C limb count; scaled by 8 when used as an index
    49  define(`cnt',	`%rcx')		C shift count; the code below names %rcx directly and shifts by its low byte
    50  C NOTE(review): ABI_SUPPORT is defined outside this file; presumably it declares the ABIs handled here -- confirm
    51  ABI_SUPPORT(DOS64)
    52  ABI_SUPPORT(STD64)
    53  
    54  ASM_START()
    55  	TEXT
    56  	ALIGN(32)
    57  PROLOGUE(mpn_lshift)		C rp[0..n-1] = up[0..n-1] << cnt; the bits shifted out of the top limb are returned in %rax
    58  	FUNC_ENTRY(4)			C NOTE(review): defined outside this file; presumably sets up the 4 arguments for DOS64 -- confirm
    59  	cmp	$1, R8(%rcx)		C cnt = 1 gets a dedicated add-with-carry path
    60  	jne	L(gen)
    61  
    62  C For cnt=1 we want to work from lowest limb towards higher limbs.
    63  C Check for bad overlap (rp=up is OK!): rp=up+1..up+n-1 is bad, a store would hit source limbs not yet loaded.
    64  C FIXME: this could surely be done more cleverly.
    65  
    66  	mov    rp, %rax
    67  	sub    up, %rax			C rax = rp - up, in bytes
    68  	je     L(fwd)			C rp = up
    69  	shr    $3, %rax			C byte distance -> limb distance (unsigned)
    70  	cmp    n, %rax
    71  	jb     L(gen)			C 0 < rp - up < n limbs: use the high-to-low code instead
    72  C Forward cnt=1 path: n becomes the number of 4-limb groups, eax the number of leftover limbs.
    73  L(fwd):	mov	R32(n), R32(%rax)	C eax = low 32 bits of n
    74  	shr	$2, n			C n = number of complete 4-limb groups
    75  	je	L(e1)			C fewer than 4 limbs: eax is already the leftover count
    76  	and	$3, R32(%rax)		C eax = n mod 4; and also clears cy for the first adc
    77  
    78  	ALIGN(8)
    79  	nop				C NOTE(review): the two nops presumably position L(t1) after the ALIGN -- confirm
    80  	nop
    81  L(t1):	mov	(up), %r8		C load four source limbs; mov and lea leave cy untouched
    82  	mov	8(up), %r9
    83  	mov	16(up), %r10
    84  	mov	24(up), %r11
    85  	lea	32(up), up
    86  	adc	%r8, %r8		C r8 = 2*r8 + cy; cy = old top bit of r8
    87  	mov	%r8, (rp)
    88  	adc	%r9, %r9
    89  	mov	%r9, 8(rp)
    90  	adc	%r10, %r10
    91  	mov	%r10, 16(rp)
    92  	adc	%r11, %r11
    93  	mov	%r11, 24(rp)
    94  	lea	32(rp), rp
    95  	dec	n			C dec does not modify cy, so the carry chain survives the loop branch
    96  	jne	L(t1)
    97  C Tail: eax = 0..3 limbs remain and cy holds the bit shifted out of the last group.
    98  	inc	R32(%rax)		C inc/dec pair sets ZF from eax while leaving cy intact
    99  	dec	R32(%rax)
   100  	jne	L(n00)			C leftover limbs to process
   101  	adc	R32(%rax), R32(%rax)	C eax is 0 here, so eax = cy = return value
   102  	FUNC_EXIT()
   103  	ret
   104  L(e1):	test	R32(%rax), R32(%rax)	C clear cy
   105  L(n00):	mov	(up), %r8
   106  	dec	R32(%rax)
   107  	jne	L(n01)			C two or three limbs left
   108  	adc	%r8, %r8		C exactly one limb left
   109  	mov	%r8, (rp)
   110  L(ret):	adc	R32(%rax), R32(%rax)	C eax is 0 here: return cy; L(ret) is not referenced in this function
   111  	FUNC_EXIT()
   112  	ret
   113  L(n01):	dec	R32(%rax)
   114  	mov	8(up), %r9		C mov keeps the ZF produced by the dec above
   115  	jne	L(n10)			C three limbs left
   116  	adc	%r8, %r8		C exactly two limbs left
   117  	adc	%r9, %r9
   118  	mov	%r8, (rp)
   119  	mov	%r9, 8(rp)
   120  	adc	R32(%rax), R32(%rax)	C eax is 0 here: return cy
   121  	FUNC_EXIT()
   122  	ret
   123  L(n10):	mov	16(up), %r10		C exactly three limbs left
   124  	adc	%r8, %r8
   125  	adc	%r9, %r9
   126  	adc	%r10, %r10
   127  	mov	%r8, (rp)
   128  	mov	%r9, 8(rp)
   129  	mov	%r10, 16(rp)
   130  	adc	$-1, R32(%rax)		C eax is 1 here: 1 + (-1) + cy = cy = return value
   131  	FUNC_EXIT()
   132  	ret
   133  C General path: limbs are produced from the highest index downwards; neg flips cl between the lsh and rsh counts.
   134  L(gen):	neg	R32(%rcx)		C put rsh count in cl
   135  	mov	-8(up,n,8), %rax	C rax = up[n-1], the top source limb
   136  	shr	R8(%rcx), %rax		C function return value
   137  
   138  	neg	R32(%rcx)		C put lsh count in cl
   139  	lea	1(n), R32(%r8)
   140  	and	$3, R32(%r8)		C r8d = (n+1) mod 4 selects the lead-in code
   141  	je	L(rlx)			C jump for n = 3, 7, 11, ...
   142  
   143  	dec	R32(%r8)
   144  	jne	L(1)
   145  C	n = 4, 8, 12, ...
   146  	mov	-8(up,n,8), %r10	C build and store the top result limb, then continue as n-1
   147  	shl	R8(%rcx), %r10
   148  	neg	R32(%rcx)		C put rsh count in cl
   149  	mov	-16(up,n,8), %r8
   150  	shr	R8(%rcx), %r8
   151  	or	%r8, %r10
   152  	mov	%r10, -8(rp,n,8)
   153  	dec	n
   154  	jmp	L(rll)
   155  
   156  L(1):	dec	R32(%r8)
   157  	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
   158  C	n = 2, 6, 10, 14, ...
   159  	mov	-8(up,n,8), %r10	C build and store the top result limb, then continue as n-1
   160  	shl	R8(%rcx), %r10
   161  	neg	R32(%rcx)		C put rsh count in cl
   162  	mov	-16(up,n,8), %r8
   163  	shr	R8(%rcx), %r8
   164  	or	%r8, %r10
   165  	mov	%r10, -8(rp,n,8)
   166  	dec	n
   167  	neg	R32(%rcx)		C put lsh count in cl
   168  L(1x):
   169  	cmp	$1, n
   170  	je	L(ast)			C only the lowest limb remains
   171  	mov	-8(up,n,8), %r10	C build and store the two top result limbs, then continue as n-2
   172  	shl	R8(%rcx), %r10
   173  	mov	-16(up,n,8), %r11
   174  	shl	R8(%rcx), %r11
   175  	neg	R32(%rcx)		C put rsh count in cl
   176  	mov	-16(up,n,8), %r8
   177  	mov	-24(up,n,8), %r9
   178  	shr	R8(%rcx), %r8
   179  	or	%r8, %r10
   180  	shr	R8(%rcx), %r9
   181  	or	%r9, %r11
   182  	mov	%r10, -8(rp,n,8)
   183  	mov	%r11, -16(rp,n,8)
   184  	sub	$2, n
   185  
   186  L(rll):	neg	R32(%rcx)		C put lsh count in cl
   187  L(rlx):	mov	-8(up,n,8), %r10	C start the lsh halves of the two top pending limbs
   188  	shl	R8(%rcx), %r10
   189  	mov	-16(up,n,8), %r11
   190  	shl	R8(%rcx), %r11
   191  
   192  	sub	$4, n			C				      4
   193  	jb	L(end)			C				      2
   194  	ALIGN(16)
   195  L(top):				C each pass stores four result limbs and lowers n by 4
   196  	C finish stuff from lsh block
   197  	neg	R32(%rcx)		C put rsh count in cl
   198  	mov	16(up,n,8), %r8
   199  	mov	8(up,n,8), %r9
   200  	shr	R8(%rcx), %r8
   201  	or	%r8, %r10
   202  	shr	R8(%rcx), %r9
   203  	or	%r9, %r11
   204  	mov	%r10, 24(rp,n,8)
   205  	mov	%r11, 16(rp,n,8)
   206  	C start two new rsh
   207  	mov	0(up,n,8), %r8
   208  	mov	-8(up,n,8), %r9
   209  	shr	R8(%rcx), %r8
   210  	shr	R8(%rcx), %r9
   211  
   212  	C finish stuff from rsh block
   213  	neg	R32(%rcx)		C put lsh count in cl
   214  	mov	8(up,n,8), %r10
   215  	mov	0(up,n,8), %r11
   216  	shl	R8(%rcx), %r10
   217  	or	%r10, %r8
   218  	shl	R8(%rcx), %r11
   219  	or	%r11, %r9
   220  	mov	%r8, 8(rp,n,8)
   221  	mov	%r9, 0(rp,n,8)
   222  	C start two new lsh
   223  	mov	-8(up,n,8), %r10
   224  	mov	-16(up,n,8), %r11
   225  	shl	R8(%rcx), %r10
   226  	shl	R8(%rcx), %r11
   227  
   228  	sub	$4, n
   229  	jae	L(top)			C				      2
   230  L(end):				C complete the two pending limbs and store them at rp[2] and rp[1]
   231  	neg	R32(%rcx)		C put rsh count in cl
   232  	mov	8(up), %r8
   233  	shr	R8(%rcx), %r8
   234  	or	%r8, %r10
   235  	mov	(up), %r9
   236  	shr	R8(%rcx), %r9
   237  	or	%r9, %r11
   238  	mov	%r10, 16(rp)
   239  	mov	%r11, 8(rp)
   240  
   241  	neg	R32(%rcx)		C put lsh count in cl
   242  L(ast):	mov	(up), %r10		C lowest limb: rp[0] = up[0] << cnt, nothing is shifted in from below
   243  	shl	R8(%rcx), %r10
   244  	mov	%r10, (rp)
   245  	FUNC_EXIT()
   246  	ret
   247  EPILOGUE()